R/sim_Y_MVN_X.R

Defines functions sim_Y_MVN_X

Documented in sim_Y_MVN_X

#' Simulate Scalar Outcomes from Simulated Spatially Correlated Predictors
#'
#' N spatially correlated design vectors are simulated from an MVN. These
#' design vectors are used to then simulate scalar outcomes that have
#' one of Gaussian, Binomial, Multinomial or Poisson distributions.
#' @inheritParams sim_MVN_X 
#' @param B A vector parameter values; i.e. "betas". Note that 
#' \code{length(B)} must equal \code{p + 1 = n.row * n.col + 1}; e.g. for 
#' normal outcomes \eqn{Y = XB + e} with \code{Y} a scalar outcome and 
#' \code{e} the random error. Note that when \code{dist = "multinomial"} then
#' \code{B} should be a list with length equal to \code{V - 1}, i.e., should
#' contain parameter values associated with all categories except the 
#' reference category. Alternatively, when \code{dist = "multinomial"}  
#' \code{B} may be a list of length \code{V} if one desires to specify 
#' parameters for every category, i.e., the over-parameterized model used in 
#' Friedman (2010). 
#' @param rand.err A vector for the random error standard deviation when 
#' \code{dist = "gaussian"}, or thresholding is used to obtain non-Normal 
#' draws. Must have length 1 or length N.
#' @param dist The distribution of the scalar outcome.
#' \itemize{
#'     \item \code{dist = "gaussian"} has \eqn{Y = XB + e}, where
#'     \eqn{e ~ N(0, rand.err)}.
#'     \item \code{dist = "binomial"}: Y is drawn from a binomial distribution
#'     with probability of "success" equal to \eqn{1 / (1 + 1 / exp(XB))} 
#'     using \code{rbinom()} when \code{binary.method = "traditional"}. If 
#'     \code{binary.method = "gaussian"}, then simulation  is based on a 
#'     cutoff using \code{binary.cutoff}.
#'     \item \code{dist = "multinomial"}: Y is drawn from \code{sample()} 
#'     using probabilities generated based on Chapter 6.1.3 of Agresti (2007)
#'     when \code{length(B) = V - 1} or Friedman (2010) when the 
#'     \code{length(B) = V}. Threshold-based approaches are not currently 
#'     supported.
#'     \item \code{dist = "poisson"}: Y is drawn from \eqn{Poisson(exp(XB))}
#'     using \code{rpois()}.
#' }
#' @param V A numeric value stating the number of categories desired when 
#' \code{dist = "multinomial"}.
#' @param threshold.method One of \code{"none", "manual", "percentile", "round"}.
#' When \code{"none"} draws from Binomial or Poisson distributions are taken 
#' subject-wise using base \code{R} functions. For the remaining options, 
#' draws are first taken from a Normal distribution and then thresholded. 
#' \code{"manual"} uses \code{Y.thresh} to manually select a cutoff, 
#' \code{"percentile"} uses \code{Y.thresh} to select percentiles used to 
#' bin outcomes, and \code{"round"} sets values equal or less than 0 to 0, 
#' and rounds all positive values to the nearest whole number.
#' @param Y.thresh A manual value used to threshold when 
#' \code{threshold.method = "manual"}; values equal or greater than the cutoff
#' are assigned 1 and all others 0. When \code{threshold.method = "percentile"},
#' a percentile to use to bin outcomes.
#' @param incl.subjectID When \code{incl.subjectID = TRUE} a column of subject
#' indices is generated.
#' @param print.out If \code{print.out = TRUE} then print the following for
#'  each subject, indexed y: \itemize{
#'      \item \code{X[y] \%*\% B}
#'      \item \code{p[y]}, \code{lambda[y]} for Binomial, Poisson, respectively.
#'  }
#' This is useful to see the effect of image parameter selection and beta
#' parameter selection on distributional parameters for the outcome of 
#' interest.
#' @note Careful parameter selection, i.e. \code{B}, is necessary to ensure 
#' that simulated outcomes are reasonable; in particular, counts arising from
#' the Poisson distribution can be unnaturally large.
#' @examples
#' ## generate precision matrix and take Cholesky decomposition
#' Rpre <- chol_s2Dp(im.res = c(3, 3), matrix.type = "prec",
#'                   use.spam = TRUE, neighborhood = "ar1",
#'                   triangle = "upper", return.prec = TRUE)
#' ## Generate correlation matrix & take Cholesky decomposition
#' Rcov <- chol_s2Dp(corr.structure = "ar1", im.res = c(3, 3), 
#'                   rho = 0.5,
#'                   triangle = "upper",
#'                   use.spam = FALSE, neighborhood = "none")
#'
#' ## Define non-zero beta values
#' Bex <- beta_builder(row.index = c(2, 3), 
#'                     col.index = c(3, 3),
#'                     im.res = c(3, 3),
#'                     B0 = 0, B.values = rep(1, 2),
#'                     output.indices = FALSE)
#' ## Simulate Datasets
#' ## parameter values
#' Nex = 100
#' set.seed(28743)
#'
#' ## with precision matrix
#' Gauss.exp <- sim_Y_MVN_X(N = Nex, B = Bex,
#'                          R = Rpre$R, Q = Rpre$Q,
#'                          dist = "gaussian")
#' hist(Gauss.exp$Y)
#'
#' ## with covariance matrix
#' Gauss.exc <- sim_Y_MVN_X(N = Nex, B = Bex,
#'                          R = Rcov$R, S = Rcov$S,
#'                          dist = "gaussian")
#' hist(Gauss.exc$Y)
#'
#' ## direct draws from binomial
#' Bin.ex <- sim_Y_MVN_X(N = Nex, B = Bex, R = Rcov$R, S = Rcov$S,
#'                       dist = "binomial", print.out = TRUE)
#' table(Bin.ex$Y)
#'
#' ## manual cutoff
#' Bin.ex2 <- sim_Y_MVN_X(N = Nex, B = Bex,
#'                        R = Rcov$R, S = Rcov$S,
#'                        dist = "binomial",
#'                        threshold.method = "manual",
#'                        Y.thresh = 1.25)
#' table(Bin.ex2$Y)
#'
#' ## percentile cutoff
#' Bin.ex3 <- sim_Y_MVN_X(N = Nex, B = Bex,
#'                        R = Rcov$R, S = Rcov$S,
#'                        dist = "binomial",
#'                        threshold.method = "percentile",
#'                        Y.thresh = 0.75)
#' table(Bin.ex3$Y)
#'
#' ## Poisson Example - note the large counts
#' Pois.ex <- sim_Y_MVN_X(N = Nex, B = Bex,
#'                        R = Rcov$R, S = Rcov$S,
#'                        dist = "poisson", print.out = TRUE)
#' mean(Pois.ex$Y)
#' quantile(Pois.ex$Y, 
#'          probs = c(0, 0.1, 0.25, 0.45, 0.5,
#'                    0.75, 0.9, 0.95, 0.99, 1))
#' hist(Pois.ex$Y)
#' @return A data frame where each row consists of a single subject's data.
#' Col 1 is the outcome, Y, and each successive column contains the subject
#' predictor values.
#' @importFrom stats rnorm rbinom quantile rpois
#' @importFrom Rdpack reprompt
#' @references
#' \insertRef{spam}{sim2Dpredictr}
#'
#' \insertRef{Ripley:1987}{sim2Dpredictr}
#'
#' \insertRef{Rue:2001}{sim2Dpredictr}
#' 
#' \insertRef{Agresti:2007}{sim2Dpredictr}
#' 
#' \insertRef{Friedman:2010}{sim2Dpredictr}
#' @export
sim_Y_MVN_X <- function(N, B, L = NULL, R = NULL,
                       S = NULL, Q = NULL, use.spam = TRUE,
                       mu = 0, rand.err = 1,
                       dist = "gaussian", V = NULL,
                       incl.subjectID = TRUE,
                       threshold.method = "none",
                       Y.thresh = NULL,
                       X.categorical = FALSE, X.num.categories = 2,
                       X.category.type = "percentile", 
                       X.manual.thresh = NULL,
                       X.cat.names = NULL, print.out = FALSE){
  n <- NULL

  if ( X.categorical == TRUE & X.num.categories > 2) {
    stop("Current version of sim_Y_MVN_X requires X.num.categories <= 2.")
  }

  if (threshold.method == "round" & dist != "poisson") {
    stop("Rounding is only appropriate for count outcomes.")
  }

  # Predictable errors and warnings...
  if ( (is.null(R) == TRUE) & (is.null(L) == TRUE) ){
    stop("Function requires either specification of L or R.")
  }
  if ( (is.null(R) == FALSE) & (is.null(L) == FALSE) ){
    warning("Specify either L or R, not both. Function proceeding with L.")
    R <- NULL
  }

  # Obtain number of parameters.
  LRSQ <- list(L = L, R = R, S = S, Q = Q)
  LRSQw <- LRSQ[which(lapply(LRSQ, is.null) == FALSE)]
  LRSQw1 <- LRSQw[1]
  p <- dim(LRSQw[[1]])[1]

  # build and/or verify mu is acceptable
  if (length(mu) == 1) {
    mu <- rep(mu, p)
  } else {
    if (length(mu) != p) {cat("Invalid mean vector length.", "\n")}
  }
  
  if (dist == "multinomial") {
    for (v in 1:length(B)) {
      if (length(B[[v]]) != p + 1){stop("Each B must have length 1 + nrow(L)")}
    }
  } else {
    if (length(B) != p + 1){stop("B must have length 1 + nrow(L)")}
  }
  
  out.names <- c("Y", "X0")
  for (i in 3:(p + 2)){
    out.names[i] <- paste0("X", i - 2)
  }

  # generate predictors and outcome
  X0 <- rep(1, N)
  X <- sim_MVN_X(N = N, mu = mu, L = L, R = R,
                 S = S, Q = Q, use.spam = use.spam,
                 X.categorical = X.categorical,
                 X.num.categories = X.num.categories,
                 X.category.type = X.category.type,
                 X.manual.thresh = X.manual.thresh,
                 X.cat.names = X.cat.names)
  Xn <- cbind(X0, X)

  if ( (dist == "gaussian") |
       (dist %in% c("binomial", "poisson") & threshold.method != "none")) {

    if (!(length(rand.err) %in% c(1, N))){
      stop("Length of rand.err must be 1 or N")
    } else {
      En <-stats:: rnorm(N, 0, rand.err)
      Y <- Xn %*% B + En
      if (print.out == TRUE) {
        print(Xn %*% B)
      }
    }
  }

  if (dist == "binomial"){

    if (threshold.method == "none" & is.null(Y.thresh) == FALSE ) {
      warning(paste0(
        "A cutoff value is not applicable when threshold.method = ", 
        threshold.method))
    }

    if (threshold.method != "none" & is.null(Y.thresh) == TRUE ) {
      if (threshold.method == "percentile") {
        warning(paste0("threshold.method = ",
                       threshold.method,
                       " requires user specified cutoff value. Defaulting to the median."))
      }
      if (threshold.method == "manual") {
        warning(paste0("threshold.method = ",
                       threshold.method,
                       " requires user specified cutoff value. Defaulting to 0."))
      }
    }

    if (threshold.method == "none") {
      Y <- c()
      XB <- Xn %*% B
      # for logistic regression GLM must use either:
      # p = exp(XB[y]) / (1 + exp(XB[y]))
      # p = 1 / (1 + 1 / exp(XB[y]))
      for (y in 1:N) {
        if (print.out == TRUE) {
          cat("Subject ", y, " has XB = ", XB[y],
              " and prob of 'success' =", 1 / (1 + 1 / exp(XB[y])), "\n")
        }
        Y[y] <- stats::rbinom(n = 1, size = 1, prob = 1 / (1 + 1 / exp(XB[y])))
      }
    } else {

      # prepare for thresholding
      Y0 <- Y
      Y <- c()
      # Manual Cutoff
      if (threshold.method == "manual") {

        # if no user specified value, the threshold is 0.
        if (is.null(Y.thresh) == TRUE) {
          Y.thresh <- 0
        }

        Y[Y0 > Y.thresh] <- 1
        Y[Y0 <= Y.thresh] <- 0
      }

      # Percentile Cutoff
      if (threshold.method == "percentile") {

        # If no user specified value, the threshold is the median.
        if (is.null(Y.thresh) == TRUE) {
          Y.thresh <- 0.50
        }

        if ( ( Y.thresh < 0) | (Y.thresh > 1) ) {
          stop(paste0(
            "Percentiles must be between 0 and 1, but Y.thresh = ", 
            Y.thresh))
        } else {
          perc.ct <- quantile(x = Y0, probs = Y.thresh, type = 3)
          cat("The ", 100 * Y.thresh, 
              "th Percentile (threshold) is ",
              perc.ct, ".", "\n")
          Y[Y0 > perc.ct] <- 1
          Y[Y0 <= perc.ct] <- 0
        }
      }
    }
  }
  
  if (dist == "multinomial") {
    Y <- c()
    
    # generate subject-specific probabilities
    p.mn <- generate_multinom_probs(V = V, B = B, X = Xn, X.incl.X0 = TRUE)
    
    Y <- c()
    for (i in 1:nrow(p.mn)) {
      Y[i] <-  sample(x = 1:V, size = 1, prob = p.mn[i, ])
    }
  }

  if (dist == "poisson"){

    if (threshold.method == "none") {
      # For Poisson GLM lambda = exp(XB[y])
      Y <- c()
      XB <- Xn %*% B
      for (y in 1:N) {
        if (print.out == TRUE) {
          cat("Subject ", y, " has XB =", XB[y],
              " and lambda =", exp(XB[y]), "\n")
        }
        Y[y] <- stats::rpois(n = 1, lambda = exp(XB[y]))
      }
    } else if (threshold.method == "rounding") {
      Y[Y <= 0] <- 0
      Y <- round(Y)
    }
  }

  # create final dataset
  data.1 <- data.frame(Y, Xn)
  colnames(data.1) <- out.names
  if (incl.subjectID == TRUE) {
    data.1$subjectID <- 1:N
  }
  return(data.1[ , -2])
}
jmleach-bst/sim2Dpredictr documentation built on March 4, 2024, 5:57 p.m.