R/smoothArea.R

Defines functions smoothArea

Documented in smoothArea

#' Smooth via area level model
#' 
#' Generates small area estimates  by smoothing direct estimates using an area 
#' level model
#'
#' @param formula an object of class 'formula' describing the model to be fitted. 
#'   If direct.est is specified, the right hand side of the formula is not necessary.
#' @param domain formula specifying variable containing domain labels
#' @param design an object of class "svydesign" containing the data for the model
#' @param responseType of the response variable, currently supports 'binary' (default with logit link function) or 'gaussian'.
#' @param Amat adjacency matrix for the regions. If set to NULL, the IID spatial effect will be used.
#' @param direct.est data frame of direct estimates, with first column containing domain, second column containing direct estimate, and third column containing variance of direct estimate. 
#' @param X.area areal covariates data frame. One of the column names needs to match 'domain', in order to be linked to the data input. Currently only supporting time-invariant domain-level covariates.
#' @param domain.size domain size data frame. One of the column names needs to match 'domain' in order to be linked to the data input and there must be a size column containing domain sizes.
#' @param pc.u 	hyperparameter U for the PC prior on precisions.
#' @param pc.alpha hyperparameter alpha for the PC prior on precisions.
#' @param pc.u.phi hyperparameter U for the PC prior on the mixture probability phi in BYM2 model.
#' @param pc.alpha.phi hyperparameter alpha for the PC prior on the mixture probability phi in BYM2 model.
#' @param CI 	the desired posterior credible interval to calculate
#' @param n.sample number of draws from posterior used to compute summaries
#' @param var.tol tolerance parameter; if variance of an area's direct estimator is below this value, that direct estimator is dropped from model
#'
#' @return A list with elements
#' \item{direct.est}{direct estimates}
#' \item{s.dir.iid.fit}{fitted INLA object for iid domain effects model}
#' \item{s.dir.iid.est}{non-spatial smoothed estimates}
#' \item{s.dir.sp.fit}{fitted INLA object for spatial domain effects model}
#' \item{s.dir.sp.est}{spatially smoothed estimates (if adjacency matrix provided)}
#' 
#' @export
#'
#' @examples
#' \dontrun{
#' library(survey)
#' data(DemoData2)
#' data(DemoMap2)
#' des0 <- svydesign(ids = ~clustid+id, strata = ~strata,
#'                   weights = ~weights, data = DemoData2, nest = T)
#' Xmat <- aggregate(age~region, data = DemoData2, FUN = mean)
#' 
#' EXAMPLE 1: Continuous response model
#' cts.res <- smoothArea(tobacco.use ~ 1, domain = ~region,
#'                       Amat = DemoMap2$Amat, design = des0,
#'                       pc.u = 1,
#'                       pc.alpha = 0.01,
#'                       pc.u.phi = 0.5,
#'                       pc.alpha.phi = 2/3)
#'                       
#' EXAMPLE 2: Including area level covariates
#' cts.cov.res <- smoothArea(tobacco.use ~ age, domain = ~region,
#'                           Amat = DemoMap2$Amat, design = des0,
#'                           X.area = Xmat,
#'                           pc.u = 1,
#'                           pc.alpha = 0.01,
#'                           pc.u.phi = 0.5,
#'                           pc.alpha.phi = 2/3)
#'                           
#' EXAMPLE 3: Binary response model
#' bin.res <- smoothArea(tobacco.use ~ 1, domain = ~region,
#'                       responseType = "binary",
#'                       Amat = DemoMap2$Amat, design = des0,
#'                       pc.u = 1,
#'                       pc.alpha = 0.01,
#'                       pc.u.phi = 0.5,
#'                       pc.alpha.phi = 2/3)
#'                       
#' EXAMPLE 4: Including area level covariates in binary response model
#' bin.cov.res <- smoothArea(tobacco.use ~ age, domain = ~region,
#'                           responseType = "binary",
#'                           Amat = DemoMap2$Amat, design = des0,
#'                           X.area = Xmat,
#'                           pc.u = 1,
#'                           pc.alpha = 0.01,
#'                           pc.u.phi = 0.5,
#'                           pc.alpha.phi = 2/3)
#'}
#'
smoothArea <- function(formula,
                       domain,
                       design = NULL,
                       responseType = c("gaussian", "binary")[1],
                       Amat = NULL,
                       direct.est = NULL,
                       X.area = NULL,
                       domain.size = NULL,
                       pc.u = 1,
                       pc.alpha = 0.01, 
                       pc.u.phi = 0.5,
                       pc.alpha.phi = 2/3,
                       CI = .95, 
                       n.sample = 250,
                       var.tol = 1e-10) {
  

  # SETUP
  domain.var <- all.vars(domain)[1]
  resp.frm <- as.formula(paste0("~", all.vars(update(formula, . ~ NULL))[1]))
  cov.frm <- update(formula, NULL ~ .)
  
  out <- list()

  if (is.null(direct.est)) {
    # calculate direct estimates
    if (!is.null(domain.size)) {
      dir.est <- survey::svyby(resp.frm, domain, design = design, survey::svytotal, na.rm = T)
      dir.est$domain.size <- domain.size$size[match(dir.est[, 1], domain.size[[domain.var]])]
      dir.est[, 2] = dir.est[, 2] / dir.est$domain.size
      dir.est[, 3] = (dir.est[, 3] / dir.est$domain.size) ^ 2
      dir.est <- dir.est[, 1:3]
    } else {
      dir.est <- survey::svyby(resp.frm, domain, design = design, survey::svymean, na.rm = T)
      dir.est[, 3] <- dir.est[, 3] ^ 2
    }
    
    
  } else {
    # assumes structure of direct.est!!!!
    dir.est <- direct.est
  }
  rownames(dir.est) <- NULL
  colnames(dir.est) <- c("domain", "dir.est", "dir.est.var")
  out$direct.est <- dir.est
  
  # remove any areas with zero sampling variances
  mod.dat <- dir.est
  mod.dat$dir.est <- ifelse(mod.dat$dir.est.var > var.tol, mod.dat$dir.est, NA)
  mod.dat$dir.est.var <- ifelse(mod.dat$dir.est.var > var.tol, mod.dat$dir.est.var, NA)
  mod.dat$dir.est.prec <- 1 / mod.dat$dir.est.var
  mod.dat$domain.id <- 1:nrow(mod.dat)
  if(!is.null(X.area)) {
    mod.X.area <- X.area
    mod.X.area$domain <- X.area[[domain.var]]
    mod.X.area <- mod.X.area[, names(mod.X.area) != domain.var]
    mod.dat <- merge(mod.dat, mod.X.area,  by = "domain")
  }
  mod.dat <- mod.dat[match(1:nrow(mod.dat), mod.dat$domain.id), ]
  mm.area <- stats::model.matrix(cov.frm, mod.dat)
  
  h <- function(x) x
  h.inv <- function(x) x
  if (responseType == "binary") {
    h <- function(x) logit(x)
    h.inv <- function(x) expit(x)
    mod.dat$dir.est.prec <- 
      (mod.dat$dir.est^2*(1-mod.dat$dir.est)^2) / mod.dat$dir.est.var
    mod.dat$dir.est <- h(mod.dat$dir.est)

  }
  mod.dat$scale <- mod.dat$dir.est.prec
  
  s.dir.ftxt <- 
    paste("dir.est ~ 1")
  if (length(all.vars(cov.frm)) > 0) {
    s.dir.ftxt <- 
      paste(s.dir.ftxt, paste(all.vars(cov.frm), collapse = " + "), sep = " + ")
  }
  s.dir.iid.ftxt <- 
    paste0(s.dir.ftxt, " + f(domain.id, model = 'iid', hyper = hyperpc.iid.int)")
  # set priors
  hyperpc.iid.int <- list(
    prec = list(prior = "pc.prec",
                param = c(pc.u , pc.alpha))
  )
  # SMOOTHED DIRECT w/ IID RE
  s.dir.iid.fit <-
    INLA::inla(as.formula(s.dir.iid.ftxt),
               family = "gaussian", data = mod.dat, 
               scale = mod.dat$dir.est.prec,
               control.family = 
                 list(hyper = list(prec = list(initial= log(1), fixed= TRUE))),
               control.predictor = list(compute = TRUE),
               control.compute=list(config = TRUE))
  sample.id <-c(list(domain.id = 1:nrow(mod.dat)),
                as.list(setNames(rep(1, ncol(mm.area)), colnames(mm.area))))
  s.dir.iid.sample <-
    INLA::inla.posterior.sample(n = n.sample, s.dir.iid.fit, sample.id)
  fe.idx <- grep(colnames(mm.area)[1], rownames(s.dir.iid.sample[[1]]$latent))
  fe.idx <- fe.idx:(fe.idx + ncol(mm.area) - 1)
  s.dir.iid.mat <-
    do.call(cbind, lapply(s.dir.iid.sample,
                          function(x) x$latent[1:nrow(mod.dat)] +
                            mm.area %*% x$latent[fe.idx]))

  s.dir.iid.mat <- h.inv(s.dir.iid.mat)
  out$s.dir.iid.fit <- s.dir.iid.fit

  out$s.dir.iid.est <- 
    data.frame(domain = mod.dat$domain,
               mean = rowMeans(s.dir.iid.mat),
               median = apply(s.dir.iid.mat, 1,
                              function(x) median(x, na.rm = T)),
               var = apply(s.dir.iid.mat, 1, var),
               lower = apply(s.dir.iid.mat, 1,
                             function(x) quantile(x, (1-CI)/2, na.rm = T)),
               upper = apply(s.dir.iid.mat, 1,
                             function(x) quantile(x, 1-(1-CI)/2, na.rm = T)),
               method = paste0("Smoothed Direct: IID"))
  
  # SMOOTHED DIRECT w/ BYM2 RE
  if (!is.null(Amat)) {
    mod.dat$domain.id <- match(mod.dat$domain, rownames(Amat))
    mod.dat <- mod.dat[match(1:nrow(mod.dat), mod.dat$domain.id), ]
    mm.area <- model.matrix(cov.frm, mod.dat)
    hyperpc.bym.int <- list(
      prec = list(prior = "pc.prec", param = c(pc.u , pc.alpha)),  
      phi = list(prior = 'pc', param = c(pc.u.phi , pc.alpha.phi))
    )
    s.dir.sp.ftxt <- 
      paste0(s.dir.ftxt,
             "+ f(domain.id, model = 'bym2', graph = Amat,", 
             "hyper = hyperpc.bym.int, scale.model = TRUE)")
    s.dir.sp.fit <-
      INLA::inla(as.formula(s.dir.sp.ftxt),
                 family = "gaussian", data = mod.dat, 
                 scale = mod.dat$dir.est.prec,
                 control.family = 
                   list(hyper = list(prec = list(initial= log(1), fixed= TRUE))),
                 control.predictor = list(compute = TRUE),
                 control.compute=list(config = TRUE))
    sample.id <-c(list(domain.id = 1:nrow(Amat)),
                  as.list(setNames(rep(1, ncol(mm.area)), colnames(mm.area))))
    s.dir.sp.sample <-
      INLA::inla.posterior.sample(n = n.sample, s.dir.sp.fit, sample.id)
    fe.idx <- grep(colnames(mm.area)[1], rownames(s.dir.sp.sample[[1]]$latent))
    fe.idx <- fe.idx:(fe.idx + ncol(mm.area) - 1)
    s.dir.sp.mat <-
      do.call(cbind, lapply(s.dir.sp.sample,
                            function(x) x$latent[1:nrow(Amat)] +
                              mm.area %*% x$latent[fe.idx]))
    s.dir.sp.mat <- h.inv(s.dir.sp.mat)
    out$s.dir.sp.fit <- s.dir.sp.fit
    out$s.dir.sp.est <- 
      data.frame(domain = mod.dat$domain,
                 mean = rowMeans(s.dir.sp.mat),
                 median = apply(s.dir.sp.mat, 1,
                                function(x) median(x, na.rm = T)),
                 var = apply(s.dir.sp.mat, 1, var),
                 lower = apply(s.dir.sp.mat, 1,
                               function(x) quantile(x, (1-CI)/2, na.rm = T)),
                 upper = apply(s.dir.sp.mat, 1,
                               function(x) quantile(x, 1-(1-CI)/2, na.rm = T)),
                 method = paste0("Smoothed Direct: BYM2"))
    out$s.dir.sp.est <- 
      out$s.dir.sp.est[match(out$direct.est$domain, out$s.dir.sp.est$domain),]
    
  }
  return(out)
}

Try the SUMMER package in your browser

Any scripts or data that you put into this service are public.

SUMMER documentation built on July 8, 2022, 9:05 a.m.