psindex: Parameter Stability Index

# parse psindex syntax
# YR 14 Jan 2014: move to lav_syntax.R

lavParseModelString <- function(model.syntax = '', as.data.frame. = FALSE,
                                warn = TRUE, debug = FALSE) {
  
    # check for empty syntax
    if(length(model.syntax) == 0) {
        stop("psindex ERROR: empty model syntax")
    }
    
    # remove comments prior to split. 
    # Match from comment character to newline, but don't eliminate newline
    model.syntax <- gsub("[#!].*(?=\n)","", model.syntax, perl=TRUE)
  
    # replace semicolons with newlines prior to split
    model.syntax <- gsub(";", "\n", model.syntax, fixed=TRUE)
  
    #remove whitespace prior to split
    model.syntax <- gsub("[ \t]+", "", model.syntax, perl=TRUE)
    # remove any occurrence of >= 2 consecutive newlines to eliminate \
    # blank statements; this retains a blank newline at the beginning, 
    # if such exists, but parser will not choke because of start.idx
    model.syntax <- gsub("\n{2,}", "\n", model.syntax, perl=TRUE)
  
    # break up in lines 
    model <- unlist( strsplit(model.syntax, "\n") )
    
    # check for multi-line formulas: they contain no "~" or "=" character
    # but before we do that, we remove all modifiers
    # to avoid confusion with for example equal("f1=~x1") statements
    model.simple <- gsub("\\(.*\\)\\*", "MODIFIER*", model)
  
    start.idx <- grep("[~=<>:|%]", model.simple)

    # check for empty start.idx: no operator found (new in 0.6-1)
    if(length(start.idx) == 0L) {
        stop("psindex ERROR: model does not contain psindex syntax (no operators found)")
    }

    # check for non-empty string, without an operator in the first lines
    # (new in 0.6-1)
    if(start.idx[1] > 1L) {
        # two possibilities:
        # - we have an empty line (ok)
        # - the element contains no operator (warn!)
        for(el in 1:(start.idx[1] - 1L)) {
            # not empty?
            if(nchar(model.simple[el]) > 0L) {
                warning("psindex WARNING: no operator found in this syntax line: ", model.simple[el], "\n", "                  This syntax line will be ignored!")
            }
        }
    }

    end.idx <- c( start.idx[-1]-1, length(model) )
    model.orig    <- model
    model <- character( length(start.idx) )
    for(i in 1:length(start.idx)) {
        model[i] <- paste(model.orig[start.idx[i]:end.idx[i]], collapse="")
    }
  
    # ok, in all remaining lines, we should have a '~' operator
    # OR one of '=', '<', '>', '|' outside the ""
    model.simple <- gsub("\\\".[^\\\"]*\\\"", "LABEL", model)
    idx.wrong <- which(!grepl("[~=<>:|%]", model.simple))
    if(length(idx.wrong) > 0) {
        cat("psindex: missing operator in formula(s):\n")
        print(model[idx.wrong])
        stop("psindex ERROR: syntax error in psindex model syntax")
    }

    # but perhaps we have a '+' as the first character?
    idx.wrong <- which(grepl("^\\+", model))
    if(length(idx.wrong) > 0) {
        cat("psindex: some formula(s) start with a plus (+) sign:\n")
        print(model[idx.wrong])
        stop("psindex ERROR: syntax error in psindex model syntax")
    }
  
  
    # main operation: flatten formulas into single bivariate pieces
    # with a left-hand-side (lhs), an operator (eg "=~"), and a 
    # right-hand-side (rhs)
    # both lhs and rhs can have a modifier 
    # (but we ignore the lhs modifier for now)
    FLAT.lhs         <- character(0)
    #FLAT.lhs.mod    <- character(0)
    FLAT.op          <- character(0)
    FLAT.rhs         <- character(0)
    FLAT.rhs.mod.idx <- integer(0)
    FLAT.block       <- integer(0)    # keep track of groups using ":" operator
  
    FLAT.fixed       <- character(0)  # only for display purposes! 
    FLAT.start       <- character(0)  # only for display purposes!
    FLAT.label       <- character(0)  # only for display purposes!
    FLAT.prior       <- character(0)
    FLAT.idx <- 0L
    MOD.idx  <- 0L
    CON.idx  <- 0L
    MOD <- vector("list", length=0L)
    CON <- vector("list", length=0L)
    BLOCK <- 1L
    BLOCK_OP <- FALSE
    for(i in 1:length(model)) {
        x <- model[i]
        if(debug) {
           cat("formula to parse:\n"); print(x); cat("\n")
        }
    
        # 1. which operator is used?
        line.simple <- gsub("\\\".[^\\\"]*\\\"", "LABEL", x)
        # "=~" operator?
        if(grepl("=~", line.simple, fixed=TRUE)) {
            op <- "=~"
        # "<~" operator?
        } else if(grepl("<~", line.simple, fixed=TRUE)) {
            op <- "<~"
        } else if(grepl("~*~", line.simple, fixed=TRUE)) {
            op <- "~*~"
        # "~~" operator?
        } else if(grepl("~~", line.simple, fixed=TRUE)) {
            op <- "~~"
        # "~" operator?
        } else if(grepl("~", line.simple, fixed=TRUE)) {
            op <- "~"           
        # "==" operator?
        } else if(grepl("==", line.simple, fixed=TRUE)) {
            op <- "=="  
        # "<" operator?
        } else if(grepl("<", line.simple, fixed=TRUE)) {
            op <- "<"
        # ">" operator?
        } else if(grepl(">", line.simple, fixed=TRUE)) {
            op <- ">"
        # ":=" operator?
        } else if(grepl(":=", line.simple, fixed=TRUE)) {
            op <- ":="
        # ":" operator?
        } else if(grepl(":", line.simple, fixed=TRUE)) {
            op <- ":"
        # "|" operator?
        } else if(grepl("|", line.simple, fixed=TRUE)) {
            op <- "|"
        # "%" operator?
        } else if(grepl("%", line.simple, fixed=TRUE)) {
            op <- "%"
        } else {
            stop("unknown operator in ", model[i])
        }
    
        # 2. split by operator (only the *first* occurence!)
        # check first if equal/label modifier has been used on the LEFT!
        if(substr(x,1,6) == "label(") 
            stop("label modifier can not be used on the left-hand side of the operator")
        if(op == "|") {
            op.idx <- regexpr("\\|", x)
        } else if(op == "~*~") {
            op.idx <- regexpr("~\\*~", x)    
        } else {
            op.idx <- regexpr(op, x)
        }
        lhs <- substr(x, 1L, op.idx-1L)
        # fix for 'NA' names in lhs; not likely to happen to ov.names
        # since 'NA' is not a valid name for list elements/data.frame columns
        if(lhs == "NA") lhs <- "NA."
        rhs <- substr(x, op.idx+attr(op.idx, "match.length"), nchar(x))

        # check if first character is '+'; if so, remove silently
        if(substr(rhs, 1, 1) == "+") {
            rhs <- substr(rhs, 2, nchar(rhs))
        }

        # 2b. if operator is "==" or "<" or ">" or ":=", put it in CON
        if(op == "==" || op == "<" || op == ">" || op == ":=") {
            # remove quotes, if any
            lhs <- gsub("\\\"", "", lhs)
            rhs <- gsub("\\\"", "", rhs)
            CON.idx <- CON.idx + 1L
            CON[[CON.idx]] <- list(op=op, lhs=lhs, rhs=rhs, user=1L)
            next
        }
    
        # 2c if operator is ":", put it in BLOCK
        if(op == ":") {
            FLAT.idx <- FLAT.idx + 1L
            FLAT.lhs[FLAT.idx] <- lhs
            FLAT.op[ FLAT.idx] <- op
            FLAT.rhs[FLAT.idx] <- rhs
            FLAT.fixed[FLAT.idx] <- ""
            FLAT.start[FLAT.idx] <- ""
            FLAT.label[FLAT.idx] <- ""
            FLAT.prior[FLAT.idx] <- ""
            FLAT.rhs.mod.idx[FLAT.idx] <- 0L
            if(BLOCK_OP) {
                BLOCK <- BLOCK + 1L
            }
            FLAT.block[FLAT.idx] <- BLOCK
            BLOCK_OP <- TRUE
            next
        }
    
        # 3. parse left hand
        #    lhs modifiers will be ignored for now
        lhs.formula <- as.formula(paste("~",lhs))
        out <- lav_syntax_parse_rhs(rhs=lhs.formula[[2L]])
        lhs.names <- names(out)
        # check if we have modifiers
        if(sum(sapply(out, length)) > 0L) {
            warning("psindex WARNING: left-hand side of formula below contains modifier:\n", x,"\n")
        }
    
        # 4. lav_syntax_parse_rhs (as rhs of a single-sided formula)

        # new 0.5-12: before we do this, replace '0.2?' by 'start(0.2)*'
        # requested by the simsem folks
        rhs <- gsub('\\(?([-]?[0-9]*\\.?[0-9]*)\\)?\\?',"start(\\1)\\*", rhs)
        rhs.formula <- as.formula(paste("~",rhs))
        out <- lav_syntax_parse_rhs(rhs=rhs.formula[[2L]],op=op)

        if(debug) print(out)
    
        # for each lhs element
        for(l in 1:length(lhs.names)) {
      
            # for each rhs element
            for(j in 1:length(out)) {
        
                # catch intercepts
                if(names(out)[j] == "intercept") {
                    if(op == "~") {
                        rhs.name <- ""
                    } else {
                        stop("psindex ERROR: right-hand side of formula contains an intercept, but operator is \"", op, "\" in: ", x)
                    }
                } else if(names(out)[j] == "..zero.." && op == "~") {
                    rhs.name <- ""
                } else if(names(out)[j] == "..constant.." && op == "~") {
                    rhs.name <- ""
                } else {
                    rhs.name <- names(out)[j]
                }

                # move this 'check' to post-parse 
                #if(op == "|") {
                #    th.name <- paste("t", j, sep="")
                #    if(names(out)[j] != th.name) {
                #        stop("psindex ERROR: threshold ", j, " of variable ", 
                #             sQuote(lhs.names[1]), " should be named ",
                #             sQuote(th.name), "; found ", 
                #             sQuote(names(out)[j]), "\n")
                #    }
                #}

                # catch lhs = rhs and op = "=~"
                if(op == "=~" && lhs.names[l] == names(out)[j]) {
                    stop("psindex ERROR: latent variable `", lhs.names[l], "' can not be measured by itself")
                }
        
                # check if we not already have this combination (in this group)
                # 1. asymmetric (=~, ~, ~1)
                if(op != "~~") {
                    idx <- which(FLAT.lhs == lhs.names[l] &
                                 FLAT.op  == op &
                                 FLAT.block == BLOCK &
                                 FLAT.rhs == rhs.name)
                    if(length(idx) > 0L) {
                        stop("psindex ERROR: duplicate model element in: ", model[i])
                    }
                } else {
                    # 2. symmetric (~~)
                    idx <- which(FLAT.lhs == rhs.name &
                                 FLAT.op  == "~~" &
                                 FLAT.block == BLOCK &
                                 FLAT.rhs == lhs.names[l])
                    if(length(idx) > 0L) {
                        stop("psindex ERROR: duplicate model element in: ", model[i])
                    }
                }
                FLAT.idx <- FLAT.idx + 1L
                FLAT.lhs[FLAT.idx] <- lhs.names[l]
                FLAT.op[ FLAT.idx] <- op
                FLAT.rhs[FLAT.idx] <- rhs.name
                FLAT.block[FLAT.idx] <- BLOCK
                FLAT.fixed[FLAT.idx] <- ""
                FLAT.start[FLAT.idx] <- ""
                FLAT.label[FLAT.idx] <- ""
                FLAT.prior[FLAT.idx] <- ""
        
                mod <- list()
                rhs.mod <- 0L
                if(length(out[[j]]$fixed) > 0L) {
                    mod$fixed <- out[[j]]$fixed
                    FLAT.fixed[FLAT.idx] <- paste(mod$fixed, collapse=";")
                    rhs.mod <- 1L
                }
                if(length(out[[j]]$start) > 0L) {
                    mod$start <- out[[j]]$start
                    FLAT.start[FLAT.idx] <- paste(mod$start, collapse=";")
                    rhs.mod <- 1L
                }
                if(length(out[[j]]$label) > 0L) {
                    mod$label <- out[[j]]$label
                    FLAT.label[FLAT.idx] <- paste(mod$label, collapse=";")
                    rhs.mod <- 1L
                }
                if(length(out[[j]]$prior) > 0L) {
                    mod$prior <- out[[j]]$prior
                    FLAT.prior[FLAT.idx] <- paste(mod$prior, collapse=";")
                    rhs.mod <- 1L
                }
                #if(op == "~1" && rhs == "0") {
                #    mod$fixed <- 0
                #    FLAT.fixed[FLAT.idx] <- paste(mod$fixed, collapse=";")
                #    rhs.mod <- 1L
                #}
                if(op == "=~" && rhs == "0") {
                    mod$fixed <- 0
                    FLAT.rhs[FLAT.idx] <- FLAT.lhs[FLAT.idx]
                    FLAT.fixed[FLAT.idx] <- paste(mod$fixed, collapse=";")
                    rhs.mod <- 1L
                }
        
                FLAT.rhs.mod.idx[FLAT.idx] <- rhs.mod
        
                if(rhs.mod > 0L) {
                    MOD.idx <- MOD.idx + 1L
                    MOD[[MOD.idx]] <- mod
                }
           } # rhs elements
        } # lhs elements
    } # model elements
  
    # enumerate modifier indices
    mod.idx <- which(FLAT.rhs.mod.idx > 0L)
    FLAT.rhs.mod.idx[ mod.idx ] <- 1:length(mod.idx)
  
    FLAT <- list(lhs=FLAT.lhs, op=FLAT.op, rhs=FLAT.rhs,
                 mod.idx=FLAT.rhs.mod.idx, block=FLAT.block,
                 fixed=FLAT.fixed, start=FLAT.start,
                 label=FLAT.label, prior=FLAT.prior)

    # change op for intercepts (for convenience only)
    int.idx <- which(FLAT$op == "~" & FLAT$rhs == "")
    if(length(int.idx) > 0L) {
        FLAT$op[int.idx] <- "~1"
    }

    # new in 0.6, reorder covariances here!
    FLAT <- lav_partable_covariance_reorder(FLAT)
  
    if(as.data.frame.) {
        FLAT <- as.data.frame(FLAT, stringsAsFactors=FALSE)
    }
  
    attr(FLAT, "modifiers") <- MOD
    attr(FLAT, "constraints") <- CON
  
    FLAT
}

lav_syntax_parse_rhs <- function(rhs, op="") {

    # new version YR 15 dec 2011!
    # - no 'equal' field anymore (only labels!)
    # - every modifier is evaluated
    # - unquoted labels are allowed (eg. x1 + x2 + c(v1,v2,v3)*x3)

    # fill in rhs list
    out <- list()
    repeat {
        if(length(rhs) == 1L) { # last one and only a single element
            out <- c(vector("list", 1L), out)
            NAME <- all.vars(rhs)
            if(length(NAME) > 0L) {
                names(out)[1L] <- NAME
            } else { # intercept or zero?
                if(as.character(rhs) == "1") {
                    names(out)[1L] <- "intercept"
                } else if(as.character(rhs) == "0") {
                    names(out)[1L] <- "..zero.."
                    out[[1L]]$fixed <- 0
                } else {
                    names(out)[1L] <- "..constant.."
                    out[[1L]]$fixed <- 0 
                }
            }
            break
        } else if(rhs[[1L]] == "*") { # last one, but with modifier
            out <- c(vector("list", 1L), out)
            NAME <- all.vars(rhs[[3L]])

            if(length(NAME) > 0L) { # not an intercept
                # catch interaction term
                rhs3.names <- all.names(rhs[[3L]])
                if(rhs3.names[1L] == ":") {
                    NAME <- paste(NAME[1L], ":", NAME[2L], sep = "")
                }
                names(out)[1L] <- NAME
            } else { # intercept
                names(out)[1L] <- "intercept"
            }
            i.var <- all.vars(rhs[[2L]], unique=FALSE)
            if(length(i.var) > 0L) {
                # modifier are unquoted labels
                out[[1L]]$label <- i.var
            } else {
                # modifer is something else
                out[[1L]] <- lav_syntax_get_modifier(rhs[[2L]])
            }
            break
        } else if(rhs[[1L]] == ":") { # last one, but interaction term
            out <- c(vector("list", 1L), out)
            NAME <- all.vars(rhs)
            NAME <- paste(NAME[1L], ":", NAME[2L], sep = "")
            names(out)[1L] <- NAME
            break
        } else if(rhs[[1L]] == "+") { # not last one!
            i.var <- all.vars(rhs[[3L]], unique=FALSE)
            n.var <- length(i.var)

            # catch interaction term
            rhs3.names <- all.names(rhs[[3L]])
            if(length(i.var) > 1L && ":" %in% rhs3.names) {
               colon.idx <- which(rhs3.names == ":")
               i.var <- i.var[seq_len(n.var - 1L)]
               n.var <- n.var - 1L
               i.var[n.var] <- paste(rhs3.names[colon.idx + 1L], ":",
                                     rhs3.names[colon.idx + 2L], sep = "")
            }

            out <- c(vector("list", 1L), out)
            if(length(i.var) > 0L) {
                names(out)[1L] <- i.var[n.var]
            } else {
                names(out)[1L] <- "intercept"
            }
            if(n.var > 1L) { 
                # modifier are unquoted labels
                out[[1L]]$label <- i.var[-n.var]
            } else if(length(rhs[[3L]]) == 3L && rhs3.names[1L] == "*") {
                # modifiers!!
                out[[1L]] <- lav_syntax_get_modifier(rhs[[3L]][[2L]])
            }

            # next element
            rhs <- rhs[[2L]]
        } else {
            stop("psindex ERROR: I'm confused parsing this line: ", rhs, "\n")
        }
    }

    # if multiple elements, check for duplicated elements and merge if found
    if(length(out) > 1L) {
        rhs.names <- names(out)
        while( !is.na(idx <- which(duplicated(rhs.names))[1L]) ) {
            dup.name <- rhs.names[ idx ]
            orig.idx <- match(dup.name, rhs.names)
            merged <- c( out[[orig.idx]], out[[idx]] )
            if(!is.null(merged)) # be careful, NULL will delete element
                out[[orig.idx]] <- merged
            out <- out[-idx]
            rhs.names <- names(out)
        }
    }

    # if thresholds, check order and reorder if necessary
    #if(op == "|") {
    #    t.names <- names(out)
    #    idx <- match(sort(t.names), t.names)
    #    out <- out[idx]
    #}

    out
}


lav_syntax_get_modifier <- function(mod) {

    if(length(mod) == 1L) {
        # three possibilites: 1) numeric, 2) NA, or 3) quoted character
        if( is.numeric(mod) ) 
            return( list(fixed=mod) )
        if( is.na(mod) ) 
            return( list(fixed=as.numeric(NA)) )
        if( is.character(mod) )
            return( list(label=mod) )
    } else if(mod[[1L]] == "start") {
        cof <- unlist(lapply(as.list(mod)[-1], 
                             eval, envir=NULL, enclos=NULL))
        return( list(start=cof) )
    } else if(mod[[1L]] == "equal") {
        label <- unlist(lapply(as.list(mod)[-1],    
                        eval, envir=NULL, enclos=NULL))
        return( list(label=label) )
    } else if(mod[[1L]] == "label") {
        label <- unlist(lapply(as.list(mod)[-1],
                        eval, envir=NULL, enclos=NULL))
        label[is.na(label)] <- "" # catch 'NA' elements in a label
        return( list(label=label) )
    } else if(mod[[1L]] == "prior") {
        prior <- unlist(lapply(as.list(mod)[-1],
                        eval, envir=NULL, enclos=NULL))
        return( list(prior=prior) )
    } else if(mod[[1L]] == "c") {
        # vector: we allow numeric and character only!
        cof <- unlist(lapply(as.list(mod)[-1],    
                             eval, envir=NULL, enclos=NULL))
        if(all(is.na(cof))) {
             return( list(fixed=rep(as.numeric(NA), length(cof))) )
        } else if(is.numeric(cof)) 
             return( list(fixed=cof) )
        else if(is.character(cof)) {
             cof[is.na(cof)] <- "" # catch 'NA' elements in a label
             return( list(label=cof) )
        } else {
            stop("psindex ERROR: can not parse modifier:", mod, "\n")
        }
    } else {
        # unknown expression
        # as a final attempt, we will evaluate it and coerce it
        # to either a numeric or character (vector)
        cof <- try( eval(mod, envir=NULL, enclos=NULL), silent=TRUE)
        if(is.numeric(cof))
             return( list(fixed=cof) )
        else if(is.character(cof))
             return( list(label=cof) )
        else {
            stop("psindex ERROR: can not parse modifier:", mod, "\n")
        }
    }
}