R/Sim.R

## --- SIMULATE DATA SET ---

## --- Set parameters for one model

#' Set simulated model parameters
#' 
#' Set parameters for one model.
#'
#' @param mean_fun Mean function of model.
#' @param err_dist Error distribution of model.
#' @param Dat Data frame containg x and y to estimate tail-adjusted beta "delta" parameter.
#' @param thetaM Vector of parameters values for mean function.
#' @param thetaE Vector of parameters values for error distribution.
#' @param thetaC Vector of parameters values for constants.
#' 
#' @return List containing parameters to simulated data set.
#' 
#' @keywords simulation parameters
#'
#' @examples
#'
#' ## Sim simulation parameters for gaussian-negbin model
#' set_sim_par(mean_fun="gaussian", err_dist="negbin",
#'             thetaM=c(H=60,m=50,s=10), thetaE=c(phi=1.5))
#' ## Sim simulation parameters for binomial model
#' set_sim_par(mean_fun="gaussian", err_dist="binomial.count",
#'             thetaM=c(H=60,m=50,s=10), thetaC=c(binomial_n=100))
#' 
#' @export
#'
set_sim_par <- function (mean_fun=NULL, err_dist=NULL, Dat=NULL,
                         thetaM=NULL, thetaE=NULL, thetaC=NULL) {
  ## --- Set simulated model parameters
  
  ## --- Stop if error distribution and error class are not specified
  if ( is.null(err_dist) | is.null(mean_fun) ) { stop ("Must specify err_dist and mean_fun!")}

  ## --- Check mean functions and error distributions
  check_mean_functions_and_error_distributions (mean_fun, err_dist)
  
  ## --- Only one mean function, and error distribution allowed
  if (length(mean_fun) > 1) { stop ("Only one mean function allowed!") }
  if (length(err_dist) > 1) { stop ("Only one error distribution allowed!") }
  
  ## --- Set error class
  err_class <- error_distributions (err_dist=err_dist)
  
  ## --- Set model information
  ModelInfo <- set_model_info (mean_fun=mean_fun, err_dist=err_dist, thetaC=thetaC)
  
  ## --- Initialise object
  Par <- list()

  ## --- Model name
  Par$model_name <- ModelInfo$model_name 
  
  ## --- Constants defined via thetaC or err_dist
  thetaC <- estimate_constant (ModelInfo, data=NULL)
  
  ## --- Get default parameters
  ParDefault <- get_default_par (ModelInfo)
  
  ## --- Mean function
  Par$mean_fun <- mean_fun

  ## --- Data model 
  Par$err_dist <- err_dist
  
  ## --- Error class
  Par$err_class <- err_class
  
  ## --- Use default parameters if none others supplied
  if (is.null(thetaM)) { thetaM <- ParDefault$thetaM }
  if (is.null(thetaE)) { thetaE <- ParDefault$thetaE }
  if (is.null(thetaC)) { thetaC <- ParDefault$thetaC }
  
  ## --- Store parameters
  Par$thetaM <- thetaM       ## Mean parameters
  Par$thetaE <- thetaE       ## Error parameters
  Par$thetaC <- thetaC       ## "Constants" parameters
  
  ## --- Combine parameters
  Par$theta <- c(thetaM, thetaE, thetaC)
  
  ## --- Check if parameter values are within bounds
  parslegal <- check_par_values (ModelInfo=ModelInfo, theta=Par$theta)
  if (!parslegal) { stop ("Parameter values are out of bounds!") }
  
  ## --- Set class
  class (Par) <- "senlm_par"
  
  ## --- Return parameter object
  return (Par)
}


get_default_par <- function (ModelInfo) {
  ## --- Get default parameter values for simulated data
  
  ## --- Grab model name, error distribution, and mean function
  model_name <- ModelInfo$model_name
  err_dist <- ModelInfo$err_dist
  mean_fun <- ModelInfo$mean_fun
  
  ## --- Grab constants
  thetaC <- estimate_constant (ModelInfo, data=NULL)

  ## --- Error distributions
  if (err_dist == "bernoulli")         { thetaE <- NULL }
  if (err_dist == "binomial.count")    { thetaE <- NULL }
  if (err_dist == "binomial.prop")     { thetaE <- NULL }
  if (err_dist == "poisson")           { thetaE <- NULL }
  if (err_dist == "negbin")            { thetaE <- c(phi=1.5) }
  if (err_dist == "zip")               { thetaE <- c(pi=0.3) }
  if (err_dist == "zinb")              { thetaE <- c(pi=0.3, phi=1.5) }
  if (err_dist == "zipl")              { thetaE <- c(g0=-0.9, g1=-0.5) }
  if (err_dist == "zipl.mu")           { thetaE <- c(g0=-0.9, g1=-0.05) }
  if (err_dist == "zinbl")             { thetaE <- c(g0=-0.9, g1=-0.5,  phi=1.5) }
  if (err_dist == "zinbl.mu")          { thetaE <- c(g0=-0.9, g1=-0.05, phi=1.5) }
  if (err_dist == "gaussian")          { thetaE <- c(sigma=5) }
  if (err_dist == "tweedie")           { thetaE <- c(phi=1.5, rho=1.1) }
  if (err_dist == "zig")               { thetaE <- c(pi=0.3, phi=0.5) }
  if (err_dist == "zigl")              { thetaE <- c(g0=-0.9, g1=-0.5, phi=0.5) }
  if (err_dist == "zigl.mu")           { thetaE <- c(g0=-0.9, g1=-0.5, phi=0.5) }
  if (err_dist == "ziig")              { thetaE <- c(pi=0.3, phi=0.2) }
  if (err_dist == "ziigl")             { thetaE <- c(g0=-0.9, g1=-0.5, phi=0.2) }
  if (err_dist == "ziigl.mu")          { thetaE <- c(g0=-0.9, g1=-0.5, phi=0.2) }
  if (err_dist == "tab")               { thetaE <- c(sigma=0.1) }
  if (err_dist == "zitab")             { thetaE <- c(pi=0.3, sigma=0.1) }
  
  ## --- Mean functions
  if (mean_fun == "constant")          { thetaM <- c(H=60) }
  if (mean_fun == "uniform")           { thetaM <- c(H=60, c=30, d=70) }
  if (mean_fun == "gaussian")          { thetaM <- c(H=60, m=50, s=10) }
  if (mean_fun == "mixgaussian")       { thetaM <- c(H=60, a=0.7, m1=35, m2=65, s1=10, s2=5) }
  if (mean_fun == "mixgaussian.equal") { thetaM <- c(H=60, a=0.7, m1=35, m2=65, s=5) }
  if (mean_fun == "beta")              { thetaM <- c(H=60, c=30, d=70, u=0.4, v=0.2) }
  if (mean_fun == "sech")              { thetaM <- c(H=60, m=50, s=5, r=0.75, p=1.5) }
  if (mean_fun == "sech.p1")           { thetaM <- c(H=60, m=50, s=5, r=0.75) }
  if (mean_fun == "sech.r0p1")         { thetaM <- c(H=60, m=50, s=5) }
  if (mean_fun == "modskurt")          { thetaM <- c(H=60, m=50, s=5, q=0.6, p=1.5, b=0.5) }
  if (mean_fun == "hofII")             { thetaM <- c(H=60, m=50, w0=-0.3) }
  if (mean_fun == "hofIV")             { thetaM <- c(H=60, m=50, w=0.3, k=10) }
  if (mean_fun == "hofIVb")            { thetaM <- c(H=60, m=50, w=0.3) }
  if (mean_fun == "hofV")              { thetaM <- c(H=60, m=50, w1=0.2, w2=0.4, k=10) }
  if (mean_fun == "hofVb")             { thetaM <- c(H=60, m=50, w1=0.2, w2=0.4, k=10) }
  
  ## --- Constant
  if (is.null(thetaC)) {
    ## binomial_n
    if ( (err_dist == "binomia.count") | (err_dist == "binomial.prop") ) {
      thetaC <- c(binomial_n=40)
    }
    ## delta
    if ( (err_dist == "tab") | (err_dist == "zitab") ) {
      thetaC <- c(delta=0.01)
    }
  }

  ## --- Rescale H for bernoulli/percentage models
  if ( (err_dist == "bernoulli") | (err_dist == "binomial.prop") |
       (err_dist == "tab")       | (err_dist == "zitab")         ) {
    if (any(names(thetaM) == "H"))  { thetaM["H"]  <- 0.9 }
  }
  
  ## --- Binomial count
  if (err_dist == "binomial.count") {
    ## Make H 0.9 times binomial n parameter
    if (any(names(thetaM) == "H"))  { thetaM["H"] <- round(0.9*thetaC["binomial_n"]) }
  }
  
  ## --- Set error class
  err_class <- error_distributions (err_dist=err_dist)
  
  ## --- Store parameter names
  Par <- list ()
  Par$model_name <- model_name
  Par$err_class <- err_class
  Par$thetaM <- thetaM
  Par$thetaE <- thetaE
  Par$thetaC <- thetaC
  
  ## --- Return parameter names
  return (Par)
}


#' Create default parameter list
#' 
#' Create parameter object using default values.
#' 
#' @param models Models defined by set_models() object.
#' @param mean_fun Mean function.
#' @param mean_class String that indicates the class of mean functions to list.
#' @param err_dist Error distribution.
#' @param err_class String that indicates the class of error distributions to list.
#' @param binomial_n Value of binomial n parameter for binomial models.
#' @param method How mean function and error distributions should be combined;
#' "paired" or "crossed".
#'
#' @return List of parameter simulation objects.
#' 
#' @keywords Simulation parameters
#'
#' @examples
#'
#' ## Create default parameters from set_models() object
#' Models <- set_models (mean_fun=c("gaussian", "beta", "sech", "modskurt"),
#'                       err_dist=c("poisson","zip"), method="crossed")
#' Pars <- create_default_par_list (Models)
#'
#' ## Create parameter object by pairing mean function and error distribution
#' Pars <- create_default_par_list (method="paired",
#'                                  mean_fun=c("gaussian", "beta"), 
#'                                  err_dist=c("zip", "zinb"))
#'
#' ## Create parameter object by crossing mean function and error distribution
#' Pars <- create_default_par_list (method="crossed",
#'                                  mean_fun=c("gaussian", "beta"), 
#'                                  err_dist=c("poisson", "zip", "zinb"))
#' 
#' @export
#' 
create_default_par_list <- function (models=NULL,
                                     mean_fun=NULL, mean_class=NULL,
                                     err_dist=NULL, err_class=NULL,
                                     binomial_n=NA,
                                     method="paired") {
  ## --- Create default model parameter object
  
  if (!is.null(models)) {
    if (any(class(models)=="model_list")) {
      ## --- Parameters are specified by a set_models() object
      
      ## --- Grab mean functions, error distributions, and binomial_n
      mf <- as.character(models$mean_fun)          
      ed <- as.character(models$err_dist)
      bn <- models$binomial_n
    } else {
      stop ("Models argument should be specified with set_models()!" )
    }
  } else {
    ## --- Mean functions, error distributions, and binomial_n are
    ##     supplied separately in vectors
    
    ## --- Stop if mean function and mean class are both specified
    if (!is.null(mean_fun) & !is.null(mean_class)) {
      stop ("Do not specify mean_fun **and** mean_class!")
    }
    ## --- Stop if error distribution and error class both specified
    if (!is.null(err_dist) & !is.null(err_class)) {
      stop ("Do not specify err_dist **and** err_class!")
    }
    
    ## --- If error distribution and error class not specifed set error class to "all"
    if ( is.null(err_dist) & is.null(err_class)) { err_class <- "all"; method <- "crossed" }
    ## --- Set error distribution if error class supplied
    if (!is.null(err_class)) { 
      err_dist <- error_distributions (err_class=err_class)
    }
    
    ## --- If mean function and mean class not specifed set class to "all"
    if ( is.null(mean_fun) & is.null(mean_class) ) { mean_class <- "all";  method <- "crossed" }
    ## --- Set mean function if class supplied
    if (!is.null(mean_class)) { 
      mean_fun <- mean_functions (mean_class=mean_class)
    }
    
    ## --- Set mean functions to all if not set  
    if (is.null(mean_fun)) { mean_fun <- mean_functions(mean_class="all") }
    
    ## --- Check mean functions and error distributions
    check_mean_functions_and_error_distributions (mean_fun, err_dist)
    
    ## --- Remove NA from binomial_n, unless all are NA then set to singular NA 
    ##     (binomial_n must be NA, an integer, or a vector of integers each each binomial model)
    if (all(is.na(binomial_n))) {
      binomial_n <- NA
    } else {
      binomial_n <- binomial_n[!is.na(binomial_n)]
    }
    
    ## --- Create data frame of models
    if (is.na(binomial_n)) {
      ## Reset binomial n to NA (it will be set by set_sim_par)
      ## Set temporarily to 1 to avoid error from input check
      DF <- set_models (mean_fun=mean_fun, err_dist=err_dist, binomial_n=1, method=method)
      DF$binomial_n <- NA
    } else {
      ## Use supplied binomial_n
      DF <- set_models (mean_fun=mean_fun, err_dist=err_dist, binomial_n=binomial_n, method=method)
    }
    
    ## --- Grab mean functions and error distributions
    mf <- as.character(DF$mean_fun)          
    ed <- as.character(DF$err_dist)
    bn <- DF$binomial_n
  }
  
  ## --- Initalise parameter list
  Pars <- vector (mode="list", length=length(mf))

  ## --- Create default parameter objects
  for (i in 1:length(Pars)) {

    ## Set binomial n parameter
    if (is.na(bn[i])) {
      ## Use default n
      thetaC <- NULL
    } else {
      ## Use supplied n
      thetaC <- c(binomial_n=bn[i])
    }
    
    ## Create default parameter object
    Pars[[i]] <- set_sim_par (mean_fun=mf[i], err_dist=ed[i], thetaC=thetaC)
  }
  
  ## --- Return parameter objects
  return (Pars)
}


simulate_data <- function (x, Par) {
  ## --- Simulate data
  
  ## --- Grab error distribution
  err_dist <- Par$err_dist
  
  ## Create data object
  Dat <- list()

  ## Calculate mean function
  mu <- do.call (paste("mu_", Par$mean_fun, sep=""), list(Par$thetaM, x))
  
  ## --- SIMULATE DATA

  ## --- Bernoulli
  if (err_dist == "bernoulli" ) {
    ## --- Bernoulli (0/1 Prescence/Abscence data).
    y <- as.numeric(stats::runif (n=length(x)) <= mu)
  }
  
  ## --- Binomial - count
  if (err_dist == "binomial.count" ) {
    n <- as.list(Par$thetaC)$binomial_n
    p <- mu/n
    y <- stats::rbinom (n=length(x), prob=p, size=n)
  }
  
  ## --- Binomial - prop
  if (err_dist == "binomial.prop" ) {
    n <- as.list(Par$thetaC)$binomial_n
    p <- mu
    y <- (stats::rbinom (n=length(x), prob=p, size=n)) / n
  }
  
  ## --- Poisson
  if (err_dist == "poisson" ) {
    y <- stats::rpois (n=length(x), lambda=mu)
  }
  
  ## --- Negative binomial (Gamma-Poisson)
  if (err_dist == "negbin" ) {
    ## --- Generate a Gamma (shape=alpha, rate=beta) [mean=alpha/beta]
    ## --- then a Poisson with the mean given by the Gamma rv.
    ## --- Mean of the NegBin is mu = alpha/beta,
    ## --- and variance is sigam = mu + phi*mu^2.
    ## --- mu is given by the mean function eta.

    ## If phi=0 then data=poisson
    if (as.list(Par$thetaE)$phi == 0) {
      poisson.means <- mu
    } else {
      ## Convert mean and dispersion parameter to shape parameters of Gamma distribution
      alpha <- 1/as.list(Par$thetaE)$phi
      beta  <- alpha/mu
      
      ## Simulate data
      poisson.means <- stats::rgamma(n=length(x), shape=alpha, rate=beta)
    }
    
    ## Sample Poisson given lambda
    y <- rep(0, length(x))
    for (i in 1:length(x)) {
      y[i] <- stats::rpois(1, lambda=poisson.means[i])
    }
  }

  ## --- Zero-inflated Poisson
  if (err_dist == "zip" ) {

    ## Generate Poisson data
    y <- stats::rpois (n=length(x), lambda=mu)
    
    ## Randomly change pi proportion to zero (extra zeros)
    y[stats::runif(length(x)) < as.list(Par$thetaE)$pi] <- 0
  }
    
  ## --- Zero-inflated negative binomial
  if (err_dist == "zinb" ) {
    
    ## Convert mean and dispersion parameter to shape parameters of Gamma distribution
    alpha <- 1/as.list(Par$thetaE)$phi
    beta  <- alpha/mu
    
    ## Generate poisson means
    poisson.means <- stats::rgamma(n=length(x), shape=alpha, rate=beta)
    
    ## Sample Poisson given lambda
    y <- rep(0, length(x))
    for (i in 1:length(x)) {
      y[i] <- stats::rpois(1, lambda=poisson.means[i])
    }
 
    ## Randomly change pi proportion to zero (extra zeros)
    y[stats::runif(length(x)) < as.list(Par$thetaE)$pi] <- 0
  }
  
  ## --- Zero-linked Poisson
  if ( (err_dist == "zipl") | (err_dist == "zipl.mu") ) {

    ## Grab parameters
    g0 <- as.list(Par$thetaE)$g0
    g1 <- as.list(Par$thetaE)$g1
    NonZero <- (mu > 0)

    ## Create y vector
    y <- rep (NA, length(mu))
    ## y is zero if mean is zero
    y[!NonZero] <- 0

    ## Grab non-zero means
    mu1 <- mu[NonZero]

    ## Create spike parameter
    if (err_dist == "zipl") {
      logitpi <- g0 + g1*log(mu1)
    }
    if (err_dist == "zipl.mu") {
      logitpi <- g0 + g1*mu1
    }
    pi <- exp(logitpi) / ( 1 + exp(logitpi) )
 
    ## Generate Poisson data
    y[NonZero] <- stats::rpois (n=sum(NonZero), lambda=mu1)
    
    ## Randomly change pi proportion to zero (extra zeros)
    y[stats::runif(sum(NonZero)) < pi] <- 0
  }

  ## --- Zero-linked negative binomial
  if ( (err_dist == "zinbl") | (err_dist == "zinbl.mu") ) {
    
    ## Grab parameters
    g0  <- as.list(Par$thetaE)$g0
    g1  <- as.list(Par$thetaE)$g1
    phi <- as.list(Par$thetaE)$phi
    NonZero <- (mu > 0)

    ## Convert mean and dispersion parameter to shape parameters of Gamma distribution
    alpha <- 1/phi
    beta  <- alpha/mu

    ## Generate poisson means
    poisson.means <- stats::rgamma(n=length(x), shape=alpha, rate=beta)

    ## Create y vector
    y <- rep (NA, length(poisson.means))
    ## y is zero if mean is zero
    y[!NonZero] <- 0

    ## Grab non-zero means
    mu1 <- poisson.means[NonZero]

    ## Create spike parameter
    if (err_dist == "zinbl") {
      logitpi <- g0 + g1*log(mu1)
    }
    if (err_dist == "zinbl.mu") {
      logitpi <- g0 + g1*mu1
    }
    pi <- exp(logitpi) / ( 1 + exp(logitpi) )
    
    ## Generate Poisson data
    y[NonZero] <- stats::rpois (n=sum(NonZero), lambda=mu1)
    
    ## Randomly change pi proportion to zero (extra zeros)
    y[stats::runif(sum(NonZero)) < pi] <- 0
  }

  ## --- Tail-adjusted beta error and Zero-inflated tail-adjusted beta
  if ( (err_dist == "tab" ) | (err_dist == "zitab" ) ) {

    ## Grab parameters
    delta <- as.list(Par$thetaC)$delta
    sigma <- as.list(Par$thetaE)$sigma

    ## Set beta distribution parameters
    Alpha <- mu / sigma
    Beta  <- (1 - mu) / sigma

    ## Generate beta data
    y <- stats::rbeta (n=length(x), shape1=Alpha, shape2=Beta)

    ## Remove values greater than 1 or less than 0
    y[y<0] <- 0;  y[y>1] <- 1

    ## If mean is 0 or 1, set y to 0 or 1 respectively
    y[mu==0] <- 0 ; y[mu==1] <- 1

    ## Set values < delta to 0 and > (1-delta) to 1
    y[y < delta] <- 0  ; y[y > (1-delta)] <- 1
  }
  
  ## --- Zero-inflated tail-adjusted beta
  if (err_dist == "zitab") {
    
    ## Grab parameters
    pi <- as.list(Par$thetaE)$pi
    
    ## Randomly change pi proportion to zero (extra zeros)
    y[stats::runif(length(x)) < pi] <- 0
  } 
  
  ## --- Gaussian error (data non-negative, zero when mean zero)
  if (err_dist == "gaussian" ) {
    y <- stats::rnorm (n=length(x), mean=mu, sd=as.list(Par$thetaE)$sigma)
    y[y<0] <- 0
    y[mu==0] <- 0
  }
  
  ## --- Tweedie error (data non-negative, zero when mean zero)
  if (err_dist == "tweedie" ) {
    ## --- Find where mean is zero
    Index <- (mu > 0)
    ## Ignore observations where mean is zero
    Mu    <- mu[Index]
    y     <- rep (0, length(x))
    ## Simulate data
    y[Index] <- tweedie::rtweedie (n=sum(Index), power=as.list(Par$thetaE)$rho,
                                   mu=Mu, phi=as.list(Par$thetaE)$phi)
    ## Make sure observation are zero when mean is zero
    y[y < 0] <- 0
    y[mu==0] <- 0
  }
  
  ## --- Zero-inflated gamma error (data non-negative, zero when mean zero)
  if (err_dist == "zig" ) {

    ## Grab parameters
    pi  <- as.list(Par$thetaE)$pi
    phi <- as.list(Par$thetaE)$phi

    ## --- Find where mean is positive
    NonZero <- (mu > 0)

    ## Ignore observations where mean is zero
    Mu    <- mu[NonZero]
    y     <- rep (0, length(x))
    
    ## If phi=0 then data=mean
    if (phi == 0) {
      y[NonZero] <- Mu
    } else {
      ## Convert mean and dispersion parameter to shape parameters of Gamma distribution
      alpha <- 1/phi
      beta  <- alpha/Mu
      
      ## Simulate data
      y[NonZero] <- stats::rgamma(n=sum(NonZero), shape=alpha, rate=beta)
    }
    
    ## Make sure observation are zero when mean is zero
    y[y < 0] <- 0
    y[mu==0] <- 0

    ## Randomly change pi proportion to zero (extra zeros)
    y[stats::runif(length(x)) < pi] <- 0
  }
  
  
  ## --- Zero-linked gamma
  if ( (err_dist == "zigl") | (err_dist == "zigl.mu") ) {

    ## Grab parameters
    g0  <- as.list(Par$thetaE)$g0
    g1  <- as.list(Par$thetaE)$g1
    phi <- as.list(Par$thetaE)$phi

    ## --- Find where mean is positive
    NonZero <- (mu > 0)
    
    ## Ignore observations where mean is zero
    Mu    <- mu[NonZero]
    y     <- rep (0, length(x))

    ## If phi=0 then data=mean
    if (phi == 0) {
      y[NonZero] <- Mu
    } else {
      ## Convert mean and dispersion parameter to shape parameters of Gamma distribution
      alpha <- 1/phi
      beta  <- alpha/Mu
      
      ## Simulate data
      y[NonZero] <- stats::rgamma(n=sum(NonZero), shape=alpha, rate=beta)
    }
    
    ## Make sure observation are zero when mean is zero
    y[y < 0] <- 0
    y[!NonZero] <- 0

    ## Create spike parameter
    if (err_dist == "zigl")    { logitpi <- g0 + g1*log(Mu) }
    if (err_dist == "zigl.mu") { logitpi <- g0 + g1*Mu      }
    pi <- exp(logitpi) / ( 1 + exp(logitpi) )
    
    ## Randomly change pi proportion to zero (extra zeros)
    y[stats::runif(sum(NonZero)) < pi] <- 0
  }
  
  
  ## --- Zero-inflated inverse gaussian error (data non-negative, zero when mean zero)
  if (err_dist == "ziig" ) {
    
    ## Grab parameters
    pi  <- as.list(Par$thetaE)$pi
    phi <- as.list(Par$thetaE)$phi

    ## --- Find where mean is positive
    NonZero <- (mu > 0)
    
    ## Ignore observations where mean is zero
    Mu    <- mu[NonZero]
    y     <- rep (0, length(x))
    
    ## If phi=0 then data=mean
    if (phi == 0) {
      y[NonZero] <- Mu
    } else {
      ## Simulate data
      y[NonZero] <- rinversegaussian(n=sum(NonZero), mu=Mu, phi=phi)
    }
    
    ## Make sure observation are zero when mean is zero
    y[y < 0] <- 0
    y[mu==0] <- 0

    ## Randomly change pi proportion to zero (extra zeros)
    y[stats::runif(length(x)) < pi] <- 0
  }
  
  
  ## --- Zero-linked inverse gaussian
  if ( (err_dist == "ziigl") | (err_dist == "ziigl.mu") ) {
    
    ## Grab parameters
    g0  <- as.list(Par$thetaE)$g0
    g1  <- as.list(Par$thetaE)$g1
    phi <- as.list(Par$thetaE)$phi

    ## --- Find where mean is positive
    NonZero <- (mu > 0)
    
    ## Ignore observations where mean is zero
    Mu    <- mu[NonZero]
    y     <- rep (0, length(x))

    ## If phi=0 then data=mean
    if (phi == 0) {
      y[NonZero] <- Mu
    } else {      
      ## Simulate data
      y[NonZero] <- rinversegaussian(n=sum(NonZero), mu=Mu, phi=phi)
    }
    
    ## Make sure observation are zero when mean is zero
    y[y < 0] <- 0
    y[!NonZero] <- 0
    
    ## Create spike parameter
    if (err_dist == "ziigl")    { logitpi <- g0 + g1*log(Mu) }
    if (err_dist == "ziigl.mu") { logitpi <- g0 + g1*Mu      }
    pi <- exp(logitpi) / ( 1 + exp(logitpi) )
    
    ## Randomly change pi proportion to zero (extra zeros)
    y[stats::runif(sum(NonZero)) < pi] <- 0
  }
  
  ## Store data
  Dat$data_name  <- "Simulation"
  Dat$model_name <- Par$model_name
  Dat$err_dist   <- err_dist
  Dat$mean_fun   <- Par$mean_fun
  Dat$transform  <- "raw"
  Dat$N          <- length(x)
  Dat$y          <- y
  Dat$x          <- x
  Dat$mu         <- mu
  Dat$thetaE     <- Par$thetaE
  Dat$thetaM     <- Par$thetaM
  
  ## Return data object
  return (Dat)
}


#' Create list of simulated data sets
#' 
#' Simulate data sets defined by the given parameter list.
#'
#' @param Pars List containg parameters for the simulated data sets.
#' @param x Vector of x values to simulated data. If not given, N, xmin, and xmax must be).
#' @param N (optional) Sample size of data set to simulate. Default value 500.
#' @param xmin (optional) Minimum value of x variable. Default value 0.
#' @param xmax (optional) Maximum value of x variable. Default value 100.
#' @param seed Random number seed.
#' @param echo If TRUE display list describing data sets simulated
#' 
#' @return Data frame containing one x variable and simulated data sets from Pars object.
#' 
#' @keywords simulated data sets
#' 
#' @examples
#'
#'
#' ## Set model
#' Models <- set_models (mean_fun="gaussian", err_dist="negbin")
#'
#' ## Set list of default parameters for each model
#' Pars <- create_default_par_list (Models)
#'
#' ## Simulate data
#' Data <- create_simulated_datasets (Pars, seed=12345)
#' 
#' @export
#' 
create_simulated_datasets <- function (Pars, x=NULL, N=500, xmin=0, xmax=100,
                                       seed=NULL, echo=FALSE) {
  ## --- Create simulated data set object
  
  ## --- Define random number seed
  if ( is.null (seed) ) {
    seed <- as.numeric(paste(c(sample(1:9, 1), sample(0:9,(5-1))), collapse=""))
  }
  ## Set random seed
  set.seed (seed)
 
  ## Store given x values or simulate new ones
  if (is.null(x)) {
    ## --- Simulate x values
    
    ## --- Check if N, xmin, and xmax are univariate, numeric
    if (!((N[1]==round(N[1])) & (length(N)==1))) { stop ("N must be univariate integer/numeric!") }
    if (!(is.numeric(xmin) & (length(xmin)==1))) { stop ("xmin must be univariate numeric!") }
    if (!(is.numeric(xmax) & (length(xmax)==1))) { stop ("xmax must be univariate numeric!") }
    if (xmin > xmax) { stop ("xmin must be less than xmax!") }
    
    ## Create random x between xmin and xmax
    x <- sort(stats::runif(N, xmin, xmax))
    
  } else {
    ## --- A data frame of x values is given
    if (!is.vector(x))  { stop ("X must be a vector!") }
    N    <- length(x)
    xmin <- NULL
    xmax <- NULL
  }
  

  ## --- Create data sets

  ## --- Set data object
  Data <- vector (length=length(Pars), mode="list")
  
  ## Loop through parameter sets
  for (i in 1:length(Pars)) {

    ## Generate data
    Data[[i]] <- simulate_data (x=x, Par=Pars[[i]])

    ## Data name
    Data[[i]]$data_name <- paste("Simulation: (N=",Data[[i]]$N,
                                 ", ",Data[[i]]$model_name,")", sep="")
  } 
  
  ## Print data models
  if (echo) {
    Mat <- as.matrix(sapply (Data, '[[', "data_name"))
    row.names(Mat) <- paste (1:length(Data), ":", sep="")
    colnames(Mat)  <- "Data sets"
    print (Mat)
  }

  ## --- Convert data to data frame

  ## Initialise data frame
  df <- as.data.frame (matrix(NA, nrow=length(x), ncol=length(Pars)+1))

  ## Store x
  df[,1] <- x

  ## Create variable names
  Names <- rep ("", ncol(df))
  Names[1] <- "x"

  ## Store data and grab y variable names
  for (i in 1:length(Pars)) {
    ## Response variable
    df[,i+1] <- Data[[i]]$y
    ## Variabe name - convert "-" to "_"
    Names[i+1] <- gsub ("-", "_", Data[[i]]$model_name) ## *** This line should not be needed!
  }
  ## Add names to data frame
  names(df) <- Names
  
  ## Return data object
  return (df)
}


## --- SUMMARISE DATA SET ---

summarise_datasets <- function (Data) {
  ## --- Summarise data sets
  
  ## Initialise summary statistics
  Summary <- matrix (NA, nrow=length(Data), ncol=4)
  DataNames <- rep ("", length(Data))
  
  ## Calculate summary statistics
  for (i in 1:length(Data)) {
    ## Grab data name
    DataNames[i] <- Data[[i]]$data_name
    ## Sample size
    Summary[i,1] <- length (Data[[i]]$y)
    ## Mean
    Summary[i,2] <- mean (Data[[i]]$y, na.rm=TRUE)
    ## Standard deviaton
    Summary[i,3] <- stats::sd (Data[[i]]$y, na.rm=TRUE)
    ## Variance
    Summary[i,4] <- stats::var (Data[[i]]$y, na.rm=TRUE)
  }
  
  ## Add row and column names
  colnames (Summary) <- c("N", "mean", "sd", "var")
  rownames (Summary) <- DataNames
  
  ## Convert to data frame
  Summary <- as.data.frame (Summary)
  
  ## Return summary object
  return (Summary)
}


classify_dataset <- function (Dat) {
  ## --- Classify data set as;
  ## binary     : Prescence/absence {0,1}
  ## percentage : Proportions [0,1]  (includes binomial proportions
  ## count      : Discrete counts; 0,1,2,...
  ## abundance  : Non-negative continuous; >= 0
  ## real       : Unbounded continuous (no models for this at this time)
  ##
  ## Note: Data cannot be classified as binomial count data based on data alone.
  
  ## Grab y data
  y <- Dat$y
  
  ## Classify y variable
  if ( all ( (y==0) | (y==1) ) ) {
    ## Prescence/Abscence (Binary data - 0,1)
    err_class <- "binary"
  } else if ( (min(y) >= 0) & (max(y) <= 1) ) {
    ## Percentage
    err_class <- "percentage"
  } else if ( all (y==round(y,0)) & (min(y)>=0) ) {
    ## Count data (non-negative integer)
    err_class <- "count"
  } else if ( min(y)>=0 ) {
    ## Abundance data (non-negative real)
    err_class <- "abundance"
  } else {
    ## Real data (unbounded)
    err_class <- "real"
  }

  ## Return data classification
  return (err_class)
}


## --- SELECT DATA SETS ---

select_datasets <- function (Data, Index) {
  ## --- Set selected simulated data sets given by Index

  ## Store original data sets
  Data0 <- Data

  ## Create new list to store subsetted data sets
  Data <- vector (length=length(Index), mode="list")

  ## Loop through selected data sets
  for (k in 1:length(Index)) {
    ## Select kth data set
    Data[[k]] <- Data0[[Index[k]]]
  }
  
  ## Return selected data sets
  return (Data)
  
}

select_dataset <- function (Data, k) {
  ## --- Set one simulated data set

  ## Select kth data set
  Dat <- Data[[k]]
  
  ## Return data set
  return (Dat)
}


## --- TRANSFORM DATA SETS ---

transform_datasets <- function (Data, log10trans=NULL, logtrans=NULL, sqrttrans=NULL) {
  ## --- Transform selected repsonse variables

  ## --- Log10 function (modified base 10 log function)
  ## y' = y            , y <= 1 
  ## y' = log10(y) + 1 , y  > 1
  if (!is.null(log10trans)) {
    for (i in 1:length(log10trans)) {
      k <- log10trans[i]
      Data[[k]]$transform <- "log10(y)+1"
      Data[[k]]$data_name <- paste (Data[[k]]$data_name, "-", Data[[k]]$transform)
      Data[[k]]$y         <- Log10 (Data[[k]]$y)
    }
  }
  
  ## --- log function
  ## y' = log(y + 1) 
  if (!is.null(logtrans)) {
    for (i in 1:length(logtrans)) {
      k <- logtrans[i]
      Data[[k]]$transform <- "log (y + 1)"
      Data[[k]]$data_name <- paste (Data[[k]]$data_name, "-", Data[[k]]$transform)
      Data[[k]]$y         <- log (Data[[k]]$y + 1)
    }
  }

  ## --- Sqrt function
  ## y' = sqrt(y)
  if (!is.null(sqrttrans)) {
    for (i in 1:length(sqrttrans)) {
      k <- sqrttrans[i]
      Data[[k]]$transform <- "sqrt"
      Data[[k]]$data_name <- paste (Data[[k]]$data_name, "-", Data[[k]]$transform)
      Data[[k]]$y         <- sqrt (Data[[k]]$y)
    }
  }
  
  ## --- Return data
  return (Data)
}

Log10 <- function (y) {
  ## --- Modified base 10 log function
  ## y' = y            , y <= 1 
  ## y' = log10(y) + 1 , y  > 1

  ## Transform data
  y[y>1] <- log10 (y[y>1]) + 1

  ## Return data
  return (y)
}

rinversegaussian <- function (n, mu, phi) {
  ## --- Simulate from an inverse gaussian

  ## --- Check length of mu
  if (length(mu)==1) { mu <- rep(mu, n)}
  if (length(mu) != n) { stop ("length of mu must be 1 or n!") }
    
  ## Sample from a normal distribution with a mean of 0 and 1 standard deviation
  v  <- stats::rnorm(n)

  ## Create possible return value 
  y  <-  v^2
  x  <-  mu + (0.5 * phi * mu^2 * y) - (0.5 * mu * phi) * sqrt(4*mu*y/phi + mu^2*y^2)

  ## Perform test and set return value
  Rand <-  stats::runif(n)
  Test <- Rand <= mu/(mu+x)
  Result <- mu^2/x
  Result[Test] <- x[Test]
  
  ## --- Return result
  return (Result)
}

dinversegaussian  <- function (y, mu, phi, log=TRUE) {
  ## --- Denisty of inverse gausssian

  ## --- Stop if y or parameters are non-positive
  if (any(y   <  0)) { stop ("y must be positive!")   } 
  if (any(mu  <= 0)) { stop ("mu must be positive!")  }
  if (any(phi <= 0)) { stop ("phi must be positive!") }
  
  ## --- Calculate denisty on log scale
  Result <- -(y - mu)^2/(2*y*phi*mu^2) - 0.5*(log(2*pi*phi) + 3*log(y))
  Result[y==0] <- -Inf
  Result[is.nan(Result)] <- -Inf
  
  ## --- Exponentiate if not in log scale  
  if (log==FALSE) {
    Result <- exp(Result)
  }
    
  ## --- Return density/log-density
  return (Result)
}

pinversegaussian <- function (q, mu, phi) {
  ## --- Calculate CDF for inverse gaussian function

  ## --- Stop if q or parameters are non-positive
  if (any(q   <  0)) { stop ("q must be non-negative!") }
  if (any(mu  <  0)) { stop ("mu must be non-negative!")    }
  if (any(phi <= 0)) { stop ("phi must be positive")    }
  
  ## --- Partial calculations
  t <- q/mu
  v <- sqrt(q * phi)

  ## --- Calculate cdf
  Result <- stats::pnorm((t - 1)/v) + exp(2/(mu * phi)) * stats::pnorm(-(t + 1)/v)
  ## --- CDF is zero if q is zero
  Result[q==0] <- 0
  Result[is.nan(Result)] <- 0
  
  ## --- Return cdf
  return (Result)
}
PRIMER-e/senlm documentation built on Nov. 20, 2022, 2:38 a.m.