R/simulate.R

Defines functions rsglmm

Documented in rsglmm

######################################################################
##
### Commentary: Functions to simulate from the model.
##
######################################################################

##' Simulate from a variety of spatial models.
##'
##' The spatial Gaussian random field is simulated using the Cholesky
##' decomposition of the covariance matrix.
##'
##' The sample from a quasi distribution uses a hack which matches the
##' mean and the variance of the distribution. See the source code for
##' details.
##' @title Simulation from a spatial model
##' @param n The number of instances to simulate
##' @param formula A representation of the model in the form
##'   \code{response ~ terms}. The LHS can be omitted. If the LHS
##'   exists, it can be of the form \code{y}, \code{y|z}, or sums of
##'   terms at either side of the \code{|} to specify the names of the
##'   variables to include in the data frame.
##' @param family The distribution of the data to simulate from.
##' @param data An optional data frame containing the variables in the
##'   model.
##' @param weights An optional vector of weights. Number of replicated
##'   samples for Gaussian and gamma, number of trials for binomial,
##'   time length for Poisson.
##' @param subset An optional set of indices. Simulations will be
##'   provided for those locations only.
##' @param offset See \code{\link[stats]{lm}}.
##' @param atsample A formula of the form \code{~ Xcoord + Ycoord}
##'   specifying the sampled locations.
##' @param beta A vector of the regressor coefficents to use.
##' @param linkp The link function parameter.
##' @param phi The spatial range parameter.
##' @param omg The relative nugget parameter.
##' @param kappa The spatial smoothness parameter.
##' @param ssq The partial sill parameter.
##' @param corrfcn The correlation function to use.
##' @param longlat How to compute the distance between locations. If
##'   \code{FALSE}, Euclidean distance, if \code{TRUE} Great Circle
##'   distance. See \code{\link[sp]{spDists}}.
##' @param dispersion The fixed dispersion parameter. When this is not
##'   1 and the sample is from a binomial or a Poisson distribution,
##'   no such distribution exists so an approximate sample is
##'   returned. Use with caution.
##' @param returnGRF Whether to return the simulate Gaussian random
##'   field as well.
##' @param warndisp Whether to warn when sampling from a quasi
##'   distribution. This is the case for binomial and Poisson when the
##'   dispersion is not one.
##' @return A data frame containing the predictors, sampling
##'   locations, optional weights, and samples.
##' @name rsglmm
##' @examples \dontrun{
##' n <- 100
##' beta <- c(-2, 1)
##' phi <- .2
##' omg <- .3
##' linkp <- 0
##' ssq <- 1
##' l <- rep(10, n)
##' corrf <- "matern"
##' kappa <- .5
##' family <- "poisson"
##' Xcoord <- runif(n)
##' Ycoord <- runif(n)
##' f <- Xcoord + Ycoord
##' formula <- y|z ~ f
##' mydata <- rsglmm(1, formula, family, weights = l,
##'                  atsample = ~ Xcoord + Ycoord, beta = beta, linkp = linkp,
##'                  phi = phi, omg = omg, kappa = kappa, ssq = ssq,
##'                  corrfcn = corrf, returnGRF = TRUE)
##' }
##' @export
##' @importFrom stats rnorm rbinom rpois rgamma rbeta model.frame
##'   update as.formula model.offset
##' @importFrom sp spDists
rsglmm <- function(n, formula,
                   family = "gaussian",
                   data, weights, subset, offset, atsample,
                   beta, linkp, phi, omg, kappa, ssq,
                   corrfcn = "matern",
                   longlat = FALSE, dispersion = 1, returnGRF = FALSE,
                   warndisp = TRUE) {

  ## Family
  ifam <- .geoBayes_family(family)
  if (ifam) {
    family <- .geoBayes_models$family[ifam]
  } else {
    stop ("You cannot simulate transformed Gaussian using this function.")
  }

  ## Logical input
  returnGRF <- as.logical(returnGRF)
  if (length(returnGRF) != 1 || is.na(returnGRF))
    stop ("Input returnGRF must be a single logical value.")
  warndisp <- as.logical(warndisp)
  if (length(warndisp) != 1 || is.na(warndisp))
    stop ("Input warndisp must be a single logical value.")

  ## Check dispersion input
  dispersion <- as.double(dispersion)
  if (dispersion <= 0) stop ("Invalid argument dispersion")

  ## Check the link parameter
  nu <- .geoBayes_getlinkp(linkp, ifam)

  ## kappa
  if (missing(kappa)) kappa <- NA

  ## Response name
  formula <- as.formula(formula)
  if (length(formula) == 2) { # No LHS
    ## RHS <- formula[[2]]
    nmy <- "y"
    nmz <- "z"
  } else { # LHS names provided
    LHS <- formula[[2]]
    ## RHS <- formula[[3]]
    if ("|" %in% all.names(LHS)) { # LHS contains bar
      nmy <- all.vars(LHS[[2]])    # Before the bar
      nmz <- all.vars(LHS[[3]])    # After the bar
    } else {
      nmy <- all.vars(LHS)
      nmz <- "z"
    }
  }
  formula0 <- update(formula, NULL ~ .)

  ## Design matrix and data
  if (missing(data)) data <- environment(formula)
  mfc <- match.call(expand.dots = FALSE)
  m <- match(c("formula", "data", "subset", "weights", "offset"),
             names(mfc), 0L)
  mfc <- mfc[c(1L, m)]
  mfc$formula <- formula0
  mfc$drop.unused.levels <- TRUE
  mfc$na.action <- "na.fail"
  mfc[[1L]] <- quote(stats::model.frame)
  mf <- eval(mfc, parent.frame())
  mt <- attr(mf, "terms")
  FF <- model.matrix(mt,mf)
  nloc <- nrow(FF)
  offset <- as.vector(model.offset(mf))
  if (!is.null(offset)) {
    if (length(offset) != nloc) 
      stop(gettextf("number of offsets is %d, should equal %d (number of observations)", 
                    length(offset), nloc), domain = NA)
  }

  ## Weights
  weightsc <- mfc$weights
  noweights <- is.null(weightsc)
  if (noweights) {
    ll <- rep.int(1, nloc)
    nmweights <- weights <- NULL
  } else {
    ll <- model.weights(mf)
    if (any(!is.finite(ll))) {
      stop ("Non-finite values in the weights")
    } else if (any(ll <= 0)) {
      stop ("Non-positive weights not allowed")
    }
    weights <- eval(weightsc, envir = data)
    if (length(weightsc) > 1) {
      nmweights <- "(weights)"
    } else {
      nmweights <- as.character(weightsc)
    }
    weights <- list(weights)
    names(weights) <- nmweights
  }

  ## All locations
  atsample <- update(atsample, NULL ~ . + 0) # No response and no intercept
  mfatc <- mfc
  mfatc$weights = NULL
  mfatc$formula = atsample
  mfat <- eval(mfatc, parent.frame())
  loc <- as.matrix(mfat)
  if (!all(is.finite(loc))) stop ("Non-finite values in the locations")

  ## Simulate
  z <- randgrf(n, loc, beta, phi, omg, kappa, ssq, FF, offset, corrfcn, longlat)
  pary <- linkinv(z, linkp, family)
  if (warndisp && (dispersion != 1) &&
      grepl("^(binomial|poisson)(\\..+)?$", family))
    warning ("Sampling from a quasi distribution: results are approximate.")
  distny <- gsub("\\.[[:alnum:]]+", "", family)
  y <- if (n > 0)
         switch(distny,
                gaussian = randgauss(n, nloc, ll, pary, dispersion),
                binomial = randqbinom(n, nloc, ll, pary, dispersion),
                poisson = randqpois(n, nloc, ll, pary, dispersion),
                Gamma = randgamma(n, nloc, ll, pary, dispersion))

  ## Return data.frame
  varsnm <- unique(c(all.vars(formula0), all.vars(atsample)))
  varsfm <- as.formula(paste("~", paste(varsnm, collapse = "+"), "+0"))
  varscall <- list(quote(stats::model.frame), formula = varsfm, data = data)
  outdf <- eval(as.call(varscall), parent.frame())
  varsnm <- names(outdf)
  if (!noweights && !(nmweights %in% varsnm)) {
    outdf <- cbind(outdf, weights)
    varsnm <- c(varsnm, nmweights)
  }
  NN <- nrow(outdf)
  nmall <- make.names(c(varsnm, rep(nmy, length = n),
                        if(returnGRF) rep(nmz, length = n)), unique = TRUE)
  nmyz <- nmall[(ncol(outdf) + 1):(length(nmall))]
  if (missing(subset) || is.null(subset)) subset <- 1:NN
  dfy <- matrix(NA, NN, n)
  dfy[subset, ] <- y
  if (returnGRF) {
    dfz <- matrix(NA, NN, n)
    dfz[subset, ] <- z
  } else dfz <- NULL
  dfyz <- cbind(dfy, dfz)
  colnames(dfyz) <- nmyz
  outdf <- cbind(outdf, dfyz)
  outdf
}

##' @name rsglmm
##' @export
rstrga <- function(n, formula, data, weights, subset, offset, atsample,
                   beta, linkp, phi, omg, kappa, ssq,
                   corrfcn = "matern",
                   longlat = FALSE, dispersion = 1, returnGRF = FALSE) {

  call <- match.call()
  call[[1]] <- quote(rsglmm)
  call$family <- "gaussian"
  call$warndisp <- FALSE
  eval(call, parent.frame())
}


##' @name rsglmm
##' @export
rsgrf <- function (n, formula, data, subset, offset, atsample,
                   beta, phi, omg, kappa, ssq,
                   corrfcn = "matern",
                   longlat = FALSE) {
  
  ## kappa
  if (missing(kappa)) kappa <- NA

  ## Response name
  formula <- as.formula(formula)
  if (length(formula) == 2) { # No LHS
    ## RHS <- formula[[2]]
    nmz <- "z"
  } else { # LHS names provided
    LHS <- formula[[2]]
    ## RHS <- formula[[3]]
    if ("|" %in% all.names(LHS)) { # LHS contains bar
      nmz <- all.vars(LHS[[3]])    # After the bar
    } else {
      nmz <- all.vars(LHS)
    }
  }
  formula0 <- update(formula, NULL ~ .)

  ## Design matrix and data
  if (missing(data)) data <- environment(formula)
  mfc <- match.call(expand.dots = FALSE)
  m <- match(c("formula", "data", "subset", "weights", "offset"),
             names(mfc), 0L)
  mfc <- mfc[c(1L, m)]
  mfc$formula <- formula0
  mfc$drop.unused.levels <- TRUE
  mfc$na.action <- "na.fail"
  mfc[[1L]] <- quote(stats::model.frame)
  mf <- eval(mfc, parent.frame())
  mt <- attr(mf, "terms")
  FF <- model.matrix(mt,mf)
  nloc <- nrow(FF)
  offset <- as.vector(model.offset(mf))
  if (!is.null(offset)) {
    if (length(offset) != nloc) 
      stop(gettextf("number of offsets is %d, should equal %d (number of observations)", 
                    length(offset), nloc), domain = NA)
  }

  ## All locations
  atsample <- update(atsample, NULL ~ . + 0) # No response and no intercept
  mfatc <- mfc
  mfatc$weights = NULL
  mfatc$formula = atsample
  mfat <- eval(mfatc, parent.frame())
  loc <- as.matrix(mfat)
  if (!all(is.finite(loc))) stop ("Non-finite values in the locations.")

  z <- randgrf(n, loc, beta, phi, omg, kappa, ssq, FF, offset, corrfcn, longlat)

  ## Return data.frame
  varsnm <- unique(c(all.vars(formula0), all.vars(atsample)))
  formulavars <- as.formula(paste("~", paste(varsnm, collapse = "+"), "+0"))
  varscall <- list(quote(stats::model.frame),
                   formula = formulavars, data = data)
  outdf <- eval(as.call(varscall),parent.frame())
  varsnm <- names(outdf)
  NN <- nrow(outdf)
  nmall <- make.names(c(varsnm, rep(nmz, length = n)), unique = TRUE)
  nmz <- nmall[(ncol(outdf) + 1):(length(nmall))]
  if (missing(subset) || is.null(subset)) subset <- 1:NN
  dfz <- matrix(NA, NN, n)
  dfz[subset, ] <- z
  colnames(dfz) <- nmz
  outdf <- cbind(outdf, dfz)
  outdf
}


##' @useDynLib geoBayes spcorr
randgrf <- function (n, loc, beta, phi, omg, kappa, ssq, FF, offset = NULL, 
                     corrfcn = "matern",
                     longlat = FALSE) {

  ## Check sample size
  n <- as.integer(n)
  if (n < 0) stop ("Negative sample size n entered.")

  ## Logical input
  longlat <- as.logical(longlat)
  if (length(longlat) != 1 || is.na(longlat))
    stop ("Input longlat must be a single logical value.")

  ## Correlation function
  icf <- .geoBayes_correlation(corrfcn)
  corrfcn <- .geoBayes_corrfcn$corrfcn[icf]
  kappa <- .geoBayes_getkappa(kappa, icf)
  phi <- as.double(phi)
  if (phi < 0) stop ("Input phi must >= 0")
  omg <- as.double(omg)
  if (omg < 0) stop ("Input omg must >= 0")

  ## Check beta, ssq
  beta <- as.vector(beta)
  ssq <- as.double(ssq)
  if (ssq <= 0) stop ("Non positive ssq entered.")

  ## Check loc
  loc <- as.matrix(loc)
  if (!all(is.finite(loc))) stop ("Non-finite values in the locations.")
  if (corrfcn == "spherical" && ncol(loc) > 3) {
    stop ("Cannot use the spherical correlation for dimensions
grater than 3.")
  }

  ## Check model matrix
  FF <- as.matrix(FF)
  if (!all(is.finite(FF))) stop ("Non-finite values in the design matrix.")
  nloc <- nrow(FF)
  p <- ncol(FF)
  if (length(beta) != p)
    stop ("The length of beta does not match the number of predictors.")

  ## Covariance matrix
  dm <- sp::spDists(loc, longlat = longlat)
  dmup <- dm[upper.tri(dm)]
  ndm <- as.integer(length(dmup))
  spcorr <- .Fortran("spcorr", dmup, as.double(phi), as.double(kappa),
                     ndm, icf, PACKAGE = "geoBayes")[[1]]
  Sg <- diag(ssq*(1 + omg), nloc, nloc)
  Sg[upper.tri(Sg)] <- ssq*spcorr
  Sgch <- chol(Sg)

  ## Simulate
  meanz <- as.vector(FF %*% beta)
  z <- matrix(rnorm(n*nloc), nloc, n)
  z <- meanz + crossprod(Sgch, z)
  if (!is.null(offset)) z <- z + offset
  z
}

randgauss <- function (n, ns, weight, par, dispersion) {
  mean <- weight*par
  sd <- sqrt(weight*dispersion)
  y <- matrix(rnorm(n*ns, mean, sd), ns, n)
  y
}

randqbinom <- function (n, ns, weight, par, dispersion) {
  if (dispersion == 1) {
    y <- matrix(rbinom(n*ns, weight, par), ns, n)
  } else {
    ## y = dispersion*Bin(m/dispersion,p) +
    ## dispersion*eps*Beta(1+p,2-p)
    mt <- weight/dispersion
    mtm <- floor(mt)
    y <- matrix(rbinom(ns*n, mtm, par), ns, n)*dispersion
    ii <- mt > mtm
    ni <- sum(ii)
    if (ni > 0) {
      eps <- weight[ii] %% dispersion
      bet1 <- 1 + par[ii]
      bet2 <- 2 - par[ii]
      betr <- matrix(rbeta(ni*n, bet1, bet2), ni, n)*eps
      y[ii, ] <- y[ii, ] + betr
    }
  }
  y
}

randqpois <- function (n, ns, weight, par, dispersion) {
  if (dispersion == 1) {
    mean <- weight*par
    y <- matrix(rpois(n*ns, mean), ns, n)
  } else {
    mean <- weight*par/dispersion
    y <- matrix(rpois(n*ns, mean), ns, n)*dispersion
  }
  y
}

randgamma <- function (n, ns, weight, par, dispersion) {
  scale <- par*dispersion
  shape <- weight/dispersion
  y <- matrix(rgamma(n*ns, shape = shape, scale = scale), n, ns)
  y
}

Try the geoBayes package in your browser

Any scripts or data that you put into this service are public.

geoBayes documentation built on Aug. 21, 2023, 9:08 a.m.