R/naiveSplit.R
In semtree: Recursive Partitioning for Structural Equation Models

naiveSplit <-
  function(model = NULL,
           mydata = NULL,
           control = NULL,
           invariance = NULL,
           meta = NULL,
           pp = FALSE,
           constraints = NULL,
           ...) {
    n.comp <- 0
    LL.btn <- c()
    split.btn <- c()
    cov.btn.names <- c()
    cov.btn.cols <- c()
    cov.type <- c()
    cov.col <- c()
    cov.name <- c()
    
    LL.within <- c()
    within.split <- c()
    #Baseline model to compare subgroup fits to
    ###########################################################
    ###               OPENMX USED HERE                      ###
    ###########################################################
    if (control$sem.prog == 'OpenMx') {
      modelnew <- mxAddNewModelData(model, mydata, name = "BASE MODEL")
      LL.overall <- safeRunAndEvaluate(modelnew)
      suppressWarnings(if (is.na(LL.overall))
        return(NULL))
    }
    ###########################################################
    ###               lavaan USED HERE                      ###
    ###########################################################
    if (control$sem.prog == 'lavaan') {
      #if (control$verbose) {message("Assessing overall model")}
      modelnew <-
        eval(parse(
          text = paste(
            "lavaan::",model@Options$model.type,
            '(lavaan::parTable(model),data=mydata,missing=\'',
            model@Options$missing,
            '\',do.fit=F)',
            sep = ""
          )
        ))
      #modelnew <- lavaan(parTable(model),data=mydata,model.type=model@Options$model.type,do.fit=FALSE)
      LL.overall <- safeRunAndEvaluate(modelnew)
      suppressWarnings(if (is.na(LL.overall))
        return(NULL))
    }

    if (pp) {
      comparedData <- max(meta$model.ids + 1)
    }
    else {
      comparedData <- meta$covariate.ids
    }
    for (cur_col in comparedData) {
      # parent model's likelihood (LL.baseline) is adjusted
      # for each split if there is missing data
      LL.baseline <- LL.overall
      missingModel <- missingDataModel(modelnew, mydata, cur_col)
      if (!is.null(missingModel)) {
        LL.baseline <- safeRunAndEvaluate(missingModel)
      }
      if (control$report.level > 10) {
        report(paste("Estimating baseline likelihood: ", LL.baseline), 1)
      }
      
      
      # tell the user a little bit about where we are right now
      if (control$verbose) {
        ui_message("Testing Predictor: ",
                   colnames(mydata)[cur_col])
      }
      ############################################################
      #case for factored covariates##############################
      if (is.factor(mydata[, cur_col])) {
        #unordered factors#####################################
        if (!is.ordered(mydata[, cur_col])) {
          var.type = .SCALE_CATEGORICAL
          
          val.sets <- unique(mydata[, cur_col])
          if (length(val.sets) > 1) {
            #create binaries for comparison of all combinations
            result <-
              recodeAllSubsets(mydata[, cur_col], colnames(mydata)[cur_col])
            test1 <- c()
            test2 <- rep(NA, length(mydata[, cur_col]))
            
            for (j in 1:ncol(result$columns)) {
              for (i in 1:length(mydata[, cur_col])) {
                if (isTRUE(result$columns[i, j])) {
                  test1[i] <- 1
                }
                else if (!is.na(result$columns[i, j])) {
                  test1[i] <- 0
                }
                else{
                  test1[i] <- NA
                }
              }
              test1 <- as.factor(test1)
              test2 <- data.frame(test2, test1)
            }
            test2 <- test2[, -1]
           
            for (i in 1:(result$num_sets)) {
              LL.temp <- c()
              #subset data for chosen value and store LL
              if (result$num_sets == 1) {
                subset1 <- subset (mydata, as.numeric(test2) == 2)
                subset2 <- subset (mydata, as.numeric(test2) == 1)
              }
              else {
                subset1 <- subset (mydata, as.numeric(test2[[i]]) == 2)
                subset2 <- subset (mydata, as.numeric(test2[[i]]) == 1)
              }
              
              # refit baseline model with focus parameters @TAGX
              if (!is.null(constraints) &
                  (!is.null(constraints$focus.parameters))) {
                LL.baseline <- fitSubmodels(model,
                                            subset1,
                                            subset2,
                                            control,
                                            invariance = constraints$focus.parameters)
                if (control$report.level > 10) {
                  report(
                    paste(
                      "Reestimating baseline likelihood with focus parameters: ",
                      LL.baseline
                    ),
                    1
                  )
                }
              }
              #browser()
              LL.return <-
                fitSubmodels(model, subset1, subset2, control, invariance = NULL)
              if (!is.na(LL.return)) {
                LL.within <- cbind(LL.within, (LL.baseline - LL.return))
                within.split <- cbind(within.split, i)
                cov.col <- cbind(cov.col, cur_col)
                cov.name <- cbind(cov.name, colnames(mydata[cur_col]))
                cov.type <- cbind(cov.type, var.type)
                n.comp <- n.comp + 1
              }
            }
          }
        }
        #ordered factors#########################################
        if (is.ordered(mydata[, cur_col])) {
          var.type = .SCALE_ORDINAL
          
          val.sets <- sort(unique(mydata[, cur_col]))
          
          if (length(val.sets) > 1) {
            for (i in 2:(length(val.sets))) {
              LL.temp <- c()
              #subset data for chosen value and store LL
              #cond1 <- as.numeric(as.character(mydata[,cur_col])) > (val.sets[i]+val.sets[(i-1)])/2
              #cond2 <- as.numeric(as.character(mydata[,cur_col])) < (val.sets[i]+val.sets[(i-1)])/2
             
              cond1 <- mydata[, cur_col] > val.sets[i - 1]
              cond2 <- mydata[, cur_col] <= val.sets[i - 1]
              subset1 <- subset (mydata, cond1)
              subset2 <- subset (mydata, cond2)
              
              # refit baseline model with focus parameters @TAGX
              if (!is.null(constraints) &
                  (!is.null(constraints$focus.parameters))) {
                LL.baseline <- fitSubmodels(model,
                                            subset1,
                                            subset2,
                                            control,
                                            invariance = constraints$focus.parameters)
                if (control$report.level > 10) {
                  report(
                    paste(
                      "Reestimating baseline likelihood with focus parameters: ",
                      LL.baseline
                    ),
                    1
                  )
                }
              }
              
              LL.return <-
                fitSubmodels(model, subset1, subset2, control, invariance = NULL)
              if (!is.na(LL.return)) {
                LL.within <- cbind(LL.within, (LL.baseline - LL.return))
                #within.split <- cbind(within.split, (val.sets[i]+val.sets[(i-1)])/2)

                within.split <- cbind(within.split, as.character(val.sets[i - 1]))
                cov.col <- cbind(cov.col, cur_col)
                cov.name <- cbind(cov.name, colnames(mydata[cur_col]))
                cov.type <- cbind(cov.type, var.type)
                n.comp <- n.comp + 1
              }
            }
          }
        }
      }

      #numeric (continuous) covariates################################
      if (is.numeric(mydata[, cur_col])) {
        var.type = .SCALE_METRIC
        v <- as.numeric(mydata[, cur_col])
        val.sets <- sort(unique(v))
        
        if (length(val.sets) > 1) {
          for (i in 2:(length(val.sets))) {
            LL.temp <- c()
            #subset data for chosen value and store LL
            cond1 <-
              as.numeric(mydata[, cur_col]) > (val.sets[i] + val.sets[(i - 1)]) / 2
            cond2 <-
              as.numeric(mydata[, cur_col]) < (val.sets[i] + val.sets[(i - 1)]) / 2
            subset1 <- subset (mydata, cond1)
            subset2 <- subset (mydata, cond2)
            
            #catch LLR for each comparison
            
            # refit baseline model with focus parameters @TAGX
            if (!is.null(constraints) &
                (!is.null(constraints$focus.parameters))) {
              LL.baseline <- fitSubmodels(model,
                                          subset1,
                                          subset2,
                                          control,
                                          invariance = constraints$focus.parameters)
              if (control$report.level > 10) {
                report(
                  paste(
                    "Reestimating baseline likelihood with focus parameters: ",
                    LL.baseline
                  ),
                  1
                )
              }
            }
            
            LL.return <-
              fitSubmodels(model, subset1, subset2, control, invariance = NULL)
            # cat("LLreturn:",LL.return," and value split at:",(val.sets[i]+val.sets[(i-1)])/2,"\n")
            if (!is.na(LL.return)) {
              LL.within <- cbind(LL.within, (LL.baseline - LL.return))
              within.split <-
                cbind(within.split, (val.sets[i] + val.sets[(i - 1)]) / 2)
              cov.col <- cbind(cov.col, cur_col)
              cov.name <- cbind(cov.name, colnames(mydata[cur_col]))
              cov.type <- cbind(cov.type, var.type)
              n.comp <- n.comp + 1
            }
            
          }
        }
        
      }
    }
    
    
    if (is.null(LL.within)) {
      return(NULL)
    }
    
    btn.matrix <- rbind(LL.within, cov.name, cov.col, within.split)
    colnames(btn.matrix) <-
      c(paste("var", seq(1, ncol(btn.matrix)), sep = ""))
    rownames(btn.matrix) <- c("LR", "variable", "column", "split val")
    
    filter <- c()
    if (!is.null(invariance)) {
      if (control$verbose) {
        ui_message("Filtering possible splits by invariance")
      }
      filter <-
        invarianceFilter(model, mydata, btn.matrix, LL.baseline, invariance, control)
    }
    
    # find best
    
    LL.max <- NA
    split.max <- NA
    name.max <- NA
    col.max <- NA
    type.max <- NA
    for (cur_col in 1:ncol(LL.within)) {
      if (!is.null(filter)) {
        if (!is.na(filter[1, cur_col])) {
          if (is.na(LL.max)) {
            LL.max <- LL.within[cur_col]
            split.max <- within.split[cur_col]
            name.max <- cov.name[cur_col]
            col.max <- cov.col[cur_col]
            type.max <- cov.type[cur_col]
          }
          else if (LL.within[cur_col] > LL.max) {
            LL.max <- LL.within[cur_col]
            split.max <- within.split[cur_col]
            name.max <- cov.name[cur_col]
            col.max <- cov.col[cur_col]
            type.max <- cov.type[cur_col]
          }
        }
      }
      else {
        if (!is.na(LL.within[cur_col])) {
          if (is.na(LL.max)) {
            LL.max <- LL.within[cur_col]
            split.max <- within.split[cur_col]
            name.max <- cov.name[cur_col]
            col.max <- cov.col[cur_col]
            type.max <- cov.type[cur_col]
          }
          else if (LL.within[cur_col] > LL.max) {
            LL.max <- LL.within[cur_col]
            split.max <- within.split[cur_col]
            name.max <- cov.name[cur_col]
            col.max <- cov.col[cur_col]
            type.max <- cov.type[cur_col]
          }
        }
      }
    }
    
    if (control$verbose & control$report.level==99) {
      
      cat("LL.within:",paste0(LL.within,collapse=","),"\n")
      cat("LL.max: ", paste0(LL.max,collapse=","),"\n")
      cat("within.split: ",paste0(within.split,collapse=","),"\n" )
      cat("split max",split.max,"\n")
    }
    
    # alternative way of counting the number of comparisons
    # count the number of variables instead of tests
    if (control$naive.bonferroni.type == 1) {
      n.comp <- length(comparedData)
    }
    
    
    if (is.na(LL.max)) {
      return(NULL)
    }
    else
      (
        return(
          list(
            LL.max = LL.max,
            split.max = split.max,
            name.max = name.max,
            col.max = col.max,
            type.max = type.max,
            n.comp = n.comp,
            btn.matrix = btn.matrix,
            invariance.filter = filter
          )
        )
      )
  }