R/netSEMp1.R
In netSEM: Network Structural Equation Modeling

Documented in netSEMp1

##' This function carries out netSEM using principle 1
##'
##' netSEM builds a network model of multiple continuous variables using principle 1.
##' Principle 1 determines the univariate relationships in the spirit of the Markovian process. In this case, the relationship between each pair of system variables, including predictors and the system level response is determined with the Markovian property that assumes the value of the current predictor is sufficient in relating to the next level variable, i.e., the relationship is independent of the specific value of the preceding-level variable to the current predictor, given the current value. 
##' Each pair of variables is tested for sensible (in the domain knowledge sense) paring relation chosen from 7 pre-selected common functional forms in linear regression settings. 
##' Adjusted R-squared is used for model selection for every pair.
##'
##' P-values reported in the "res.print" field of the return list contains the P-values of estimators of linear regression coefficients. 
##' The P-values are ordered in the common order of coefficients, i.e. in the order of increasing exponents. 
##' For example, in the quadratic functional form y ~ b0 + b1x + b2x^2, the three P-values correspond two those of \\hat{b0}, \\hat{b1} and \\hat{b2}, respectively. 
##' If there are less than 3 coefficients to estimate, the extra P-value field is filled with NA's. 
##'
##' @title network Structural Equation Modeling (netSEM)
##' 
##' @import segmented 
##' @importFrom stats predict
##' @importFrom stats lm
##' @importFrom stats residuals
##' @importFrom stats runif
##' @importFrom stats var
##' 
##' @param x A dataframe. By default it considers all columns as exogenous variables, except the first column which stores the system's endogenous variable. 
##' @param exogenous, by defult it considers all columns as exogenous variables except column number 1, which is the main endogenous response. 
##' @param endogenous A character string of the column name of the main endogenous OR a numeric number indexing the column of the main endogenous.
##' @param nlsInits a data frame of initial vectors for nls. Each column corresponds to a coefficient. The data frame can be generated by the genInit() function. Each row is one initial vector. Currently the only nls function included is y = a + b * exp(c * x).
##' @param str A boolean, whether or not this is a 'strength' type problem
##' @return An object of class netSEM, which is a list of the following items:
##'
##' \itemize{
##' \item "table": A matrix. For each row, first column is the endogenous variable, second column is the predictor, the other columns show corresponding summary information: Best functional form, R-squared, adj-R-squared, P-value1, P-value2 and P-value3. The P-values correspond to those of estimators of linear regression coefficients. See details. 
##' \item "bestModels": A matrix. First dimension indicates predictors. The second dimension indicates endogenous variables. The i-jth cell of the matrix stores the name of the best functional form corresponding to the j-th endogenous variable regressed on the i-th predictor.
##' \item "allModels":  A three dimensional list. The first dimension indicates predictors. The second dimension indicates endogenous variables. Third dimension indicates the fitting results of all 6 functional forms. The i-j-k-th cell of the list stores a "lm" object, corresponding to the j-th endogenous, i-th predictor and the k-th functional form.
##' }
##'
##' The object has two added attributes:
##'
##' \itemize{
##'
##' \item "attr(res.best, "Step")": A vector. For each variable, it shows in which step it is choosen to be significantly related to endogenous variable.
##'
##' \item "attr(res.best, "diag.Step")": A matrix. First dimension is predictors; second dimension is endogenous variables. Each cell shows in which step the pairwise relation is being fitted.
##' }
##'
##' @export
##'
##' @examples
##' \dontrun{
##' ## Load the sample acrylic data set
##' data(acrylic)
##'
##' ## Run netSEM
##' ans <- netSEMp1(acrylic)
##' }

netSEMp1 <- function(x,
                    exogenous = NULL,
                    endogenous = NULL,
                    nlsInits = data.frame(a1 = 1, a2 = 1, a3 = 1),
                    str=FALSE) {

  # Reorder the dataframe if the locations of exogenous and endogenous are not defined:
  #   This defines the 'default' behaviour, and enables a dataframe that is ordered as:
  #     [main response, stressor, unit level variables]  (the new requested ordering)
  #   To instead appear as:
  #     [stressor, main response, unit level variables]  (the old style ordering)
  #   Which the original sgSEM code expects, and was hard coded to accommodate
  # Note that defining the locations of exogenous and endogenous stressors explicitly may cause problems


####### Index Dependent Data Frame Reordering ###########  
  if (missing(exogenous) & missing(endogenous)) {
    x <- x[, c(2, 1, 3:ncol(x))]
  }

  ###############################
  ### Checking exogenous and main endogenous
  ###############################
  if (!missing(exogenous)) {
    if (!is.character(exogenous)) {
      if (is.wholenumber(exogenous)) {  ## Apply 'is.wholenumber' function
        if (exogenous < 1 | exogenous > length(colnames(x)))
          stop("exogenous location out of range!")
          exogenous.loc <- exogenous
      }
    } else { 
      if (!(exogenous %in% colnames(x))) {
          stop(paste0("exogenous '", exogenous, "' does not exist!"))
      }
      exogenous.loc <- which(colnames(x) == exogenous)
    }
    neworder <- 1:length(colnames(x))
    neworder[-2] <- exogenous.loc
    neworder[exogenous.loc] <- 1
    x <- x[neworder]
  } else {
    exogenous <- names(x)[-2]  ## It takes all columns as stressors (exogenous) except col2
  }

  if (!missing(endogenous)) {
    endogenous.loc <- which(colnames(x) == endogenous)
    if (!is.character(endogenous)) {
      if (is.wholenumber(endogenous)) {
        if (endogenous < 1 | endogenous > length(colnames(x)))
          stop("endogenous location out of range!")
          endogenous.loc <- endogenous
        }
    } else {
      if (!(endogenous %in% colnames(x))) {
          stop(paste0("endogenous '", endogenous, "' does not exist!"))
      }
      endogenous.loc <- which(colnames(x) == endogenous)
    }
    neworder <- 1:length(colnames(x))
    neworder[2] <- endogenous.loc
    neworder[endogenous.loc] <- 2
    x <- x[neworder]
  } else {
    endogenous <- names(x)[2]
  }
#####################################################################
  #cat("\nExogenous is:",exogenous)
  #cat("\nEndogenous is:",endogenous,"\n")

  #############################################################################
  ### Main scripts; Above function is called
  #############################################################################

  # Total number of variables
  nVar <- ncol(x)

  # Total number of possible endogenous variables
  # Given that exactly one variable is exogenous, the main stressor (ex: time)
  # NOTE: this hard-coding will need to change for multiple level stressors
  nRVar <- nVar - 1

  ## Current considered functional forms (order matters)
  if (str) {
    modelNames <- c("SL", "Quad", "SQuad", "Exp", "Log", "SQRoot", "ISQRoot",
                    "nls","CP")
  } else {
    modelNames <- c("SL", "Quad", "SQuad", "Exp", "Log", "nls","CP")
  }
  # Number of possible models (sets dimensions downstream)
  nModel <- length(modelNames)

  ## Initiate Res.all (a list object to return, all possible results in a 3 dimensional array of matrices)
  Res.all <- vector( "list", nVar * nVar * nModel) # List stores the final results
  dim(Res.all) <- c(nVar, nVar, nModel)
  Res.all[, , ] <- NA
  dimnames(Res.all) <- list(colnames(x), colnames(x), modelNames)
  # Dimensions are: [number of variables, number of variables, number of models]

  ## Initiate Res.best (a matrix of results to return)
  #   contains 'best' possible model results from Res.all in a 2 dimensional matrix
  #   and two 'attributes'
  #    1. a vector 'step', which...
  #    2. a matrix 'diag.step', which...
  Res.best <- matrix(rep(NA, nVar * nVar), nrow = nVar, 
                     dimnames = list(colnames(x), colnames(x)) )
  # Declare attribute #1 of Res.best
  attr(Res.best, "Step") <- c(Inf, 0, rep(Inf, nVar - 2))
  names(attributes(Res.best)$Step) <- colnames(x)
  # Declare attribute #2 of Res.best
  attr(Res.best, "diag.Step") <- matrix(rep(Inf, nVar*nVar), nrow = nVar)
  dimnames(attributes(Res.best)$diag.Step) <- list(colnames(x), colnames(x))
  # Dimensions of matrix are: [number of variables, number of variables]
  # Dimensions of attr 1 are: [number of variables]
  # Dimensions of attr 2 are: [number of variables, number of variables]

  nRes <- 8 # Number of cells in print variable; see below

  ## Initiate Res.print (a matrix of model statistic values to return)
  ## matrix stores final restults, including information of best model name,
  ##   and the values: R-Sqr, adj-R-Sqr, Pval1, Pval2, Pval3
  Res.print <- matrix(NA, nrow = 0, ncol = nRes)
  colnames(Res.print) <- c("endogenous", "Variable", "Model", "R-Sqr", 
                           "adj-R-Sqr", "Pval1", "Pval2", "Pval3")
  # Dimensions of matrix are: [number of relationships that pass the defined cut off threshold, 8 (number of values to report)]


  ## Indicator of possible endogenous variables have already been tested
  fitted.flag <- rep(FALSE, nRVar)
  ## Indicator of possible endogenous varibals need to be tested
  tofit.flag <- c(TRUE, rep(FALSE, nRVar - 1))

  # A list to hold findrelation() results
  res <- list()

  while (sum(tofit.flag - fitted.flag) != 0) {
    iResp <- which(tofit.flag != fitted.flag)[1] + 1
    iVar <- (1:nVar)[c(-2, -iResp)]
    # This line is meant to directly change the Res tables
    # retlist <- lapply(iVar, function(x, y) findrelation(x,y), y = iResp)
    # Instead, pass them into and retrieve them out of an externalized function call:

    for (i in iVar) {
      frfit <- findrelation(iVar = i, iResp = iResp, modelNames = modelNames,
                            str = str, cRes.all = Res.all, cRes.best = Res.best,
                            cRes.print = Res.print, x = x, nlsInits = nlsInits,
                            nRes = nRes) 
      ## Apply 'findrelation' function
      Res.all = frfit[[1]]
      Res.best = frfit[[2]]
      Res.print = frfit[[3]]
    }

    tofit.flag <- tofit.flag | (!(Res.best[-1, iResp] %in% c(NA, -1)))
    fitted.flag[iResp - 1] <- TRUE
  }

  ## Outputs are the three return objects
  res <- list(data = x, exogenous = exogenous, endogenous = endogenous,
              table = Res.print, bestModels = Res.best, allModels = Res.all)
  class(res) <- c("netSEMp1","list")
  invisible(res)
}