R/fitspace.R

Defines functions smoothSurvey

Documented in smoothSurvey

#' Fit space-time smoothing models for a generic outcome from complex surveys.
#'
#' This function calculates the direct estimates by region and fit a simple spatial smoothing model to the direct estimates adjusting for survey design.
#' Normal or binary variables are currently supported. For binary variables, the logit transformation is performed on the direct estimates of probabilities, and a Gaussian additive model is fitted on the logit scale using INLA.
#' 
#' The function \code{smoothSurvey} replaces the previous function name \code{fitGeneric} (before version 1.0.0).
#' 
#' @param data The input data frame. The input data  with column of the response variable (\code{responseVar}), region ID (\code{regionVar}), stratification within region (\code{strataVar}), and cluster ID (\code{clusterVar}).
#' \itemize{ 
#' \item For area-level model, the data frame consist of survey observations and corresponding survey weights (\code{weightVar}). 
#' \item For unit-level model and \code{is.agg = FALSE}, the data frame should consist of aggregated counts by clusters (for binary responses), or any cluster-level response (for continuous response). For binary response (\code{responseType = 'binary'}), the beta-binomial model will be fitted for cluster-level counts. For continuous response (\code{responseType = 'gaussian'}), a Gaussian smoothing model will be fitted on the cluster-level response. 
#' \item For unit-level model and \code{is.agg = TRUE}, the data frame should be the same as in the area-level model. For binary response (\code{responseType = 'binary'}), the beta-binomial model will be fitted for cluster-level counts aggregated internally. For continuous response (\code{responseType = 'gaussian'}), the nested error model will be fitted on unit-level response.
#' }
#' @param geo Deprecated argument from early versions.
#' @param Amat Adjacency matrix for the regions. If set to NULL, the IID spatial effect will be used.
#' @param region.list a vector of region names. Only used when IID model is used and the adjacency matrix not specified. This allows the output to include regions with no sample in the data. When the spatial adjacency matrix is specified, the column names of the adjacency matrix will be used to determine region.list. If set to NULL, all regions in the data are used.
#' @param X Areal covariates data frame. One of the column name needs to match the \code{regionVar} specified in the function call, in order to be linked to the data input. Currently only supporting time-invariant region-level covariates.
#' @param X.unit Column names of unit-level covariates. When \code{X.unit} is specified, a nested error model will be fitted with unit-level IID noise, and area-level predictions are produced by plugging in the covariate specified in the \code{X} argument. When \code{X} is not specified, the empirical mean of each covariate will be used. This is only implemented for continuous response with the Gaussian likelihood model and unit-level model. 
#' @param responseType Type of the response variable, currently supports 'binary' (default with logit link function) or 'gaussian'. 
#' @param responseVar the response variable
#' @param strataVar the strata variable used in the area-level model. 
#' @param weightVar the weights variable
#' @param regionVar Variable name for region.
#' @param clusterVar Variable name for cluster. For area-level model, this should be a formula for cluster in survey design object, e.g., '~clusterID + householdID'. For unit-level model, this should be the variable name for cluster unit.
#' @param pc.u hyperparameter U for the PC prior on precisions.
#' @param pc.alpha hyperparameter alpha for the PC prior on precisions.
#' @param pc.u.phi hyperparameter U for the PC prior on the mixture probability phi in BYM2 model.
#' @param pc.alpha.phi hyperparameter alpha for the PC prior on the mixture probability phi in BYM2 model.
#' @param CI the desired posterior credible interval to calculate
#' @param formula a string of user-specified random effects model to be used in the INLA call
#' @param timeVar The variable indicating time period. If set to NULL then the temporal model and space-time interaction model are ignored.
#' @param time.model the model for temporal trends and interactions. It can be either "rw1" or "rw2".
#' @param include_time_unstruct  Indicator whether to include the temporal unstructured effects (i.e., shocks) in the smoothed estimates from cluster-level model. The argument only applies to the unit-level models. Default is FALSE which excludes all unstructured temporal components. If set to TRUE all  the unstructured temporal random effects will be included. 
#' @param type.st can take values 0 (no interaction), or 1 to 4, corresponding to the type I to IV space-time interaction.
#' @param direct.est data frame of direct estimates, with column names of response and region specified by \code{responseVar}, \code{regionVar}, and \code{timeVar}.  When \code{direct.est} is specified, it overwrites the \code{data} input. 
#' @param direct.est.var the column name corresponding to the variance of direct estimates.
#' @param is.unit.level logical indicator of whether unit-level model is fitted instead of area-level model. 
#' @param is.agg logical indicator of whether the input is at the aggregated counts by cluster. Only used for unit-level model and binary response variable.
#' @param strataVar.within the variable specifying within region stratification variable. This is only used for the unit-level model. 
#' @param totalVar the variable specifying total observations in \code{counts}. This is only used for the unit-level model when \code{counts} is specified. 
#' @param weight.strata a data frame with one column corresponding to \code{regionVar}, and columns specifying proportion of each strata for each region. This argument specifies the weights for strata-specific estimates. This is only used for the unit-level model. 
#' @param nsim number of posterior draws to take. This is only used for the unit-level model when \code{weight.strata} is provided. 
#' @param save.draws logical indicator of whether to save the full posterior draws.
#' @param smooth logical indicator of whether to perform smoothing. If set to FALSE, a data frame of direct estimate is returned. Only used when \code{is.unit.level} is FALSE.
#' @param ... additional arguments passed to \code{svydesign} function.
#' 
#' 
#' @return \item{HT}{Direct estimates}
#' \item{smooth}{Smoothed direct estimates}
#' \item{fit}{a fitted INLA object}
#' \item{CI}{input argument}
#' \item{Amat}{input argument}
#' \item{responseType}{input argument}
#' \item{formula}{INLA formula}
#' @seealso \code{\link{getDirectList}}, \code{\link{smoothDirect}}
#' @importFrom stats median quantile sd var aggregate as.formula
#' @author Zehang Richard Li 
#' @examples
#' \dontrun{
#' ##
#' ## 1. Area-level model with binary response
#' ##
#' 
#' data(DemoData2)
#' data(DemoMap2)
#' fit0 <- smoothSurvey(data=DemoData2,  
#' Amat=DemoMap2$Amat, responseType="binary", 
#' responseVar="tobacco.use", strataVar="strata", 
#' weightVar="weights", regionVar="region", 
#' clusterVar = "~clustid+id", CI = 0.95)
#' summary(fit0)
#' 
#' # if only direct estimates without smoothing is of interest
#' fit0.dir <- smoothSurvey(data=DemoData2,  
#' Amat=DemoMap2$Amat, responseType="binary", 
#' responseVar="tobacco.use", strataVar="strata", 
#' weightVar="weights", regionVar="region", 
#' clusterVar = "~clustid+id", CI = 0.95, smooth = FALSE)
#' 
#' # posterior draws can be returned with save.draws = TRUE
#' fit0.draws <- smoothSurvey(data=DemoData2,  
#' Amat=DemoMap2$Amat, responseType="binary", 
#' responseVar="tobacco.use", strataVar="strata", 
#' weightVar="weights", regionVar="region", 
#' clusterVar = "~clustid+id", CI = 0.95, save.draws = TRUE)
#' # notice the posterior draws are on the latent scale
#' head(fit0.draws$draws.est[, 1:10]) 
#' 
#' # Example with region-level covariates
#'  Xmat <- aggregate(age~region, data = DemoData2, 
#' 						FUN = function(x) mean(x))
#'  fit1 <- smoothSurvey(data=DemoData2,  
#'   Amat=DemoMap2$Amat, responseType="binary", 
#'   X = Xmat,
#'   responseVar="tobacco.use", strataVar="strata", 
#'   weightVar="weights", regionVar="region", 
#'   clusterVar = "~clustid+id", CI = 0.95)
#' 
#' # Example with using only direct estimates as input instead of the full data
#' direct <- fit0$HT[, c("region", "HT.est", "HT.var")]
#' fit2 <- smoothSurvey(data=NULL, direct.est = direct, 
#'                     Amat=DemoMap2$Amat, regionVar="region",
#'                     responseVar="HT.est", direct.est.var = "HT.var", 
#'                     responseType = "gaussian")
#' # Check it is the same as fit0
#' plot(fit2$smooth$mean, fit0$smooth$mean)
#' 
#' # Example with using only direct estimates as input, 
#' #   and after transformation into a Gaussian smoothing model
#' # Notice: the output are on the same scale as the input 
#' #   and in this case, the logit estimates.    
#' direct.logit <- fit0$HT[, c("region", "HT.logit.est", "HT.logit.var")]
#' fit3 <- smoothSurvey(data=NULL, direct.est = direct.logit, 
#'                Amat=DemoMap2$Amat, regionVar="region",
#'                responseVar="HT.logit.est", direct.est.var = "HT.logit.var",
#'                responseType = "gaussian")
#' # Check it is the same as fit0
#' plot(fit3$smooth$mean, fit0$smooth$logit.mean)
#' 
#' # Example with non-spatial smoothing using IID random effects
#' fit4 <- smoothSurvey(data=DemoData2, responseType="binary", 
#'        responseVar="tobacco.use", strataVar="strata", 
#'        weightVar="weights", regionVar="region", 
#'        clusterVar = "~clustid+id", CI = 0.95)
#' 
#' # Example with missing regions in the raw input
#' DemoData2.sub <- subset(DemoData2, region != "central")
#' fit.without.central <- smoothSurvey(data=DemoData2.sub,  
#'                          Amat=NULL, responseType="binary", 
#'                          responseVar="tobacco.use", strataVar="strata", 
#'                          weightVar="weights", regionVar="region", 
#'                          clusterVar = "~clustid+id", CI = 0.95)
#' fit.without.central$HT
#' fit.without.central$smooth
#' 
#' fit.without.central <- smoothSurvey(data=DemoData2.sub,  
#'                          Amat=NULL, region.list = unique(DemoData2$region),
#'                          responseType="binary", 
#'                          responseVar="tobacco.use", strataVar="strata", 
#'                          weightVar="weights", regionVar="region", 
#'                          clusterVar = "~clustid+id", CI = 0.95)
#' fit.with.central$HT
#' fit.with.central$smooth
#' 
#' # Using the formula argument, further customizations can be added to the 
#' #  model fitted. For example, we can fit the Fay-Harriot model with 
#' #  IID effect instead of the BYM2 random effect as follows.
#' #  The "region.struct" and "hyperpc1" are picked to match internal object 
#' #  names. Other object names can be inspected from the source of smoothSurvey.
#' fit5 <- smoothSurvey(data=DemoData2,  
#'        Amat=DemoMap2$Amat, responseType="binary", 
#'        formula = "f(region.struct, model = 'iid', hyper = hyperpc1)",
#'        pc.u = 1, pc.alpha = 0.01,
#'        responseVar="tobacco.use", strataVar="strata", 
#'        weightVar="weights", regionVar="region", 
#'        clusterVar = "~clustid+id", CI = 0.95)
#' # Check it is the same as fit4, notice the region order may be different
#' regions <- fit5$smooth$region
#' plot(fit4$smooth[match(regions, fit4$smooth$region),]$logit.mean, fit5$smooth$logit.mean)
#' 
#' ##
#' ## 2. Unit-level model with binary response  
#' ##
#' 
#' # For unit-level models, we need to create stratification variable within regions
#' data <- DemoData2
#' data$urbanicity <- "rural"
#' data$urbanicity[grep("urban", data$strata)] <- "urban"
#' 
#' # Beta-binomial likelihood is used in this model
#' fit6 <- smoothSurvey(data=data, 
#'   Amat=DemoMap2$Amat, responseType="binary", 
#'   X = Xmat, is.unit.level = TRUE,
#'   responseVar="tobacco.use", strataVar.within = "urbanicity", 
#'   regionVar="region", clusterVar = "clustid", CI = 0.95)
#' 
#' # We may use aggregated PSU-level counts as input as well
#' #    in the case of modeling a binary outcome 
#' data.agg <- aggregate(tobacco.use~region + urbanicity + clustid, 
#'                       data = data, FUN = sum)
#' data.agg.total <- aggregate(tobacco.use~region + urbanicity + clustid, 
#'                       data = data, FUN = length)
#' colnames(data.agg.total)[4] <- "total"
#' data.agg <- merge(data.agg, data.agg.total)
#' head(data.agg)
#' 
#' fit7 <- smoothSurvey(data=data.agg, 
#'   Amat=DemoMap2$Amat, responseType="binary", 
#'   X = Xmat, is.unit.level = TRUE, is.agg = TRUE,
#'   responseVar = "tobacco.use", strataVar.within = "urbanicity", 
#'   totalVar = "total", regionVar="region", clusterVar = "clustid", CI = 0.95)
#' 
#' # Check it is the same as fit6
#' plot(fit6$smooth$mean, fit7$smooth$mean)  
#' 
#' ##
#' ## 3. Area-level model with continuous response
#' ##
#' 
#' # The smoothing model is the same as area-level model with binary response
#' #  the continuous direct estimates are smoothed instead of 
#' #  their logit-transformed versions for binary response.
#' fit8 <- smoothSurvey(data=DemoData2, Amat=DemoMap2$Amat, 
#'        responseType="gaussian", responseVar="age", strataVar="strata", 
#'        weightVar="weights", regionVar="region", 
#'        pc.u.phi = 0.5, pc.alpha.phi = 0.5,
#'        clusterVar = "~clustid+id", CI = 0.95)
#' 
#' ##
#' ## 4. Unit-level model with continuous response  
#' ##    (or nested error models)
#' 
#' # The unit-level model assumes for each of the i-th unit,
#' #    Y_{i} ~ intercept + region_effect + IID_i
#' #    where IID_i is the error term specific to i-th unit
#' 
#' # When more than one level of cluster sampling is carried out, 
#' #   they are ignored here. Only the input unit is considered.
#' #   So here we do not need to specify clusterVar any more. 
#' fit9 <- smoothSurvey(data= data, 
#'   Amat=DemoMap2$Amat, responseType="gaussian", 
#'   is.unit.level = TRUE, responseVar="age", strataVar.within = NULL,
#'   regionVar="region", clusterVar = NULL, CI = 0.95)
#' 
#' # To compare, we may also model PSU-level responses. As an illustration, 
#' data.median <- aggregate(age~region + urbanicity + clustid, 
#'                       data = data, FUN = median)
#' 
#' fit10 <- smoothSurvey(data= data.median, 
#'   Amat=DemoMap2$Amat, responseType="gaussian", 
#'   is.unit.level = TRUE, responseVar="age", strataVar.within = NULL,
#'   regionVar="region", clusterVar = "clustid", CI = 0.95)
#' 
#' 
#' # To further incorporate within-area stratification
#' 
#' fit11 <- smoothSurvey(data = data, 
#'   Amat = DemoMap2$Amat, responseType = "gaussian", 
#'   is.unit.level = TRUE, responseVar="age", strataVar.within = "urbanicity",
#'   regionVar = "region", clusterVar = NULL, CI = 0.95)  
#' 
#' # Notice the usual output is now stratified within each region
#' # The aggregated estimates require strata proportions for each region
#' # For illustration, we set strata population proportions below
#' prop <- data.frame(region = unique(data$region), 
#'                             urban = 0.3, 
#'                             rural = 0.7)
#' fit12 <- smoothSurvey(data=data, 
#'   Amat=DemoMap2$Amat, responseType="gaussian", 
#'   is.unit.level = TRUE, responseVar="age", strataVar.within = "urbanicity",
#'   regionVar="region", clusterVar = NULL, CI = 0.95,
#'   weight.strata = prop)  
#' 
#' # aggregated outcome
#' head(fit12$smooth.overall)
#' 
#' # Compare aggregated outcome with direct aggregating posterior means. 
#' # There could be small differences if only 1000 posterior draws are taken.
#' est.urb <- subset(fit11$smooth, strata == "urban")
#' est.rural <- subset(fit11$smooth, strata == "rural")
#' est.mean.post <- est.urb$mean * 0.3 + est.rural$mean * 0.7
#' plot(fit12$smooth.overall$mean, est.mean.post)
#' 
#' 
#' ##
#' ## 6. Unit-level model with continuous response and unit-level covariate 
#' ## 
#' 
#' # For area-level prediction, area-level covariate mean needs to be  
#' #   specified in X argument. And unit-level covariate names are specified
#' #   in X.unit argument.
#' 
#' set.seed(1)
#' sim <- data.frame(region = rep(c(1, 2, 3, 4), 1000),
#'                    X1 = rnorm(4000), X2 = rnorm(4000))
#' Xmean <- aggregate(.~region, data = sim, FUN = sum)
#' sim$Y <- rnorm(4000, mean = sim$X1 + 0.3 * sim$X2 + sim$region)
#' samp <- sim[sample(1:4000, 20), ]
#' fit.sim <- smoothSurvey(data=samp , 
#'                   X.unit = c("X1", "X2"),
#'                   X = Xmean, Amat=NULL, responseType="gaussian", 
#'                   is.unit.level = TRUE, responseVar="Y", regionVar = "region",  
#'                   pc.u = 1, pc.alpha = 0.01, CI = 0.95) 
#' 
#' }
#' @export


smoothSurvey <- function(data, geo = NULL, Amat = NULL, region.list = NULL, X = NULL, X.unit = NULL, responseType = c("binary", "gaussian")[1], responseVar, strataVar="strata", weightVar="weights", regionVar="region", clusterVar = "~v001+v002", pc.u = 1, pc.alpha = 0.01, pc.u.phi = 0.5, pc.alpha.phi = 2/3, CI = 0.95, formula = NULL, timeVar = NULL, time.model = c("rw1", "rw2")[1], include_time_unstruct = FALSE, type.st = 1, direct.est = NULL, direct.est.var = NULL, is.unit.level = FALSE, is.agg = FALSE, strataVar.within = NULL,  totalVar = NULL, weight.strata = NULL, nsim = 1000, save.draws = FALSE, smooth = TRUE, ...){

    svy <- TRUE
    if(is.null(responseVar)){
        stop("Response variable not specified")
    }
    if(is.null(responseType)){
        stop("responseType not specified")
    }  
    if(is.null(Amat) && smooth){
        message("No spatial adjacency matrix is specified. IID area random effect is used.")
    }
    is.iid.space <- FALSE
    responseType <- tolower(responseType)
    
    # there is no aggregated input for Gaussian models
    if(responseType == "gaussian") is.agg <- FALSE

    if(responseType == "binary" && !is.null(X.unit)){
        X.unit <- NULL
        warning("Unit-level covariates not implemented for binary response variable. Set X.unit = NULL.")
    }
    if(is.unit.level == FALSE && !is.null(X.unit)){
        X.unit <- NULL
        warning("Area-level model is fitted. Set X.unit = NULL.")
    }
    if(is.agg && !is.null(X.unit)){
        X.unit <- NULL
        warning("Unit-level covariates cannot be used when input data is aggregated to cluster level (is.agg = TRUE). Set X.unit = NULL.")
    }

    # whether we are fitting nested error model
    # For future update, we can also include two-fold nested error model by adding PSU-level effect
    if(responseType == "gaussian" && is.unit.level){
        is.nested <- TRUE
        clusterVar <- NULL
    }else{
        is.nested <- FALSE
    }

    if(is.nested && !is.null(X.unit) && !is.null(timeVar)){
        stop("Unit-level nested error model with covariates is not implemented with temporal components yet.")
    }
    if(is.nested && !is.null(X.unit) && !is.null(strataVar.within)){
        stop("Unit-level nested error model with covariates is not implemented with stratification components yet.")
    } 

    ##---------------------------------------------------------------------##
    ## Unit level model: data processing
    ##---------------------------------------------------------------------##
    if(is.unit.level){
        
        if(responseType == "binary"){
            message("Fitting unit-level model.")
            msg <- "Unit-level model"
        }else{
            message("Fitting unit-level nested error model.")
            msg <- "Unit-level nested error model"
        }
        if(is.null(strataVar.within)){
            message("Within region stratification variable (strataVar.within) not defined. Unstratified model is fitted.")
            strataVar.within <- "strata0"
            data$strata0 <- 1
        }
        
        if(is.null(clusterVar) && !is.nested) stop("Cluster variable (clusterVar) not defined.")

        if(is.null(data)) stop("Survey dataset not defined.")
        data$response0 <- data[, responseVar]
        data$region0 <- as.character(data[, regionVar])
        if(is.null(Amat)){
            if(!is.null(region.list)){
                regions <- unique(region.list)
                if(sum(!data$region0 %in% regions) > 0){
                    stop("'region.list' is specified but there are regions in the data not in the 'region.list'.")
                }
            }else{
                regions <- unique(data$region0)
            }
            Amat <- matrix(0, length(regions), length(regions))
            colnames(Amat) <- rownames(Amat) <- regions
            is.iid.space <- TRUE
        }
        if(sum(!data$region0 %in% colnames(Amat)) > 0){
            stop("Exist regions in the data frame but not in Amat.")
        }
        data$region0 <- factor(data$region0, levels = colnames(Amat))
        data$strata0 <- factor(data[, strataVar.within])
        if(!is.null(clusterVar)) data$cluster0 <- data[, clusterVar]

        # check column names of covariates
        if(!is.null(X.unit)){
            if(sum(X.unit %in% colnames(data)) != length(X.unit)){
                stop("Exist columns specified X.unit but not in the data.")
            }
            if(!is.null(X) && sum(X.unit %in% colnames(X)) != length(X.unit)){
                stop("Exist columns specified X.unit but not in X.")
            }
            # if X not specified, using empirical mean
            if(is.null(X)){
                data.sub <- data[, c(X.unit, "region0")]
                X <- aggregate(.~region0, data = data.sub, FUN = function(x){mean(x, na.rm = TRUE)})
            }
        }

        # If input is not aggregated, and not fitting a nested model, then aggregate
        if(!is.agg && !is.nested){
            vars <- c("cluster0", "region0", "strata0")

            if(!is.null(timeVar)){
                data$time0 <- data[, timeVar]
                vars <- c(vars, "time0")
            }
            counts <- getCounts(data[, c(vars, "response0")], variables = 'response0', by = vars, drop=TRUE) 
            # if(responseType == "gaussian"){
            #     stop("Response variable is at SSU level. The function currently only supports PSU-level response.")
            #     counts[, "response0"] <- counts[, "response0"] / counts[, "total"]
            # }  
        # If input is aggregated then define total
        }else if(is.agg){
            if(!is.null(timeVar)){
                data$time0 <- data[, timeVar]
            }
            if(is.null(totalVar) && responseType == "binary"){
                stop("Which column correspond to cluster total is not specified")
            }
            data$total <- data[, totalVar]
            counts <- data
        # if fitting nested model
        }else{
            counts <- data
        }
                
        if(!is.null(weight.strata)){
            if(sum(!data$strata0 %in% colnames(weight.strata)) > 0) stop("Exist within-area strata (strataVar.within) not in the weight.strata data frame.")
            stratalist <- unique(data$strata0)
        }else{
            stratalist <- NULL
        }

    ##---------------------------------------------------------------------##
    ## End of Unit level model data processing
    ## 
    ## Area level model with direct estimate as input    
    ##---------------------------------------------------------------------##
    }else if(!is.null(direct.est)){
        msg <- "Area-level model using direct estimates as input"
        message("Using direct estimates as input instead of survey data.")
        data <- direct.est
        data$region0 <- as.character(direct.est[, regionVar])
        if(is.null(Amat)){
            if(!is.null(region.list)){
                regions <- unique(region.list)
                if(sum(!data$region0 %in% regions) > 0){
                    stop("'region.list' is specified but there are regions in the data not in the 'region.list'.")
                }
            }else{
                regions <- unique(data$region0)
            }
            Amat <- matrix(0, length(regions), length(regions))
            colnames(Amat) <- rownames(Amat) <- regions
            is.iid.space <- TRUE
        }
        if(sum(!data$region0 %in% colnames(Amat)) > 0){
            stop("Exist regions in the data frame but not in Amat.")
        }
        data$region0 <- factor(data$region0, levels = colnames(Amat))        
        data$response0 <- direct.est[, responseVar]
        if(is.null(direct.est.var)){
            stop("Need to specify column for the variance of direct estimates")
        }
        data$var0 <- direct.est[, direct.est.var]

    ##---------------------------------------------------------------------##
    ## End of Area level model with direct estimate as input 
    ## 
    ## Area level model with raw survey data input   
    ##---------------------------------------------------------------------##
    }else{
        msg <- "Area-level model using survey data as input"
        if(!is.data.frame(data)){
            stop("Input data needs to be a data frame")
        }
        if (is.null(strataVar)) {
            message("Strata not defined. Ignoring sample design")
            svy <- FALSE
        }
        if (is.null(clusterVar)){
            message("cluster not specified. Ignoring sample design")
            svy <- FALSE
        } 
        data$response0 <- data[, responseVar]
        data$region0 <- as.character(data[, regionVar])
        if(is.null(Amat)){
            if(!is.null(region.list)){
                regions <- unique(region.list)
                if(sum(!data$region0 %in% regions) > 0){
                    stop("'region.list' is specified but there are regions in the data not in the 'region.list'.")
                }
            }else{
                regions <- unique(data$region0)
            }
            Amat <- matrix(0, length(regions), length(regions))
            colnames(Amat) <- rownames(Amat) <- regions
            is.iid.space <- TRUE
        }
        if(sum(!data$region0 %in% colnames(Amat)) > 0) stop("Exist regions in the data frame but not in Amat.")
        data$region0 <- factor(data$region0, levels = colnames(Amat))   
        if(svy){
            data$weights0 <- data[, weightVar]
            data$strata0 <- data[, strataVar]
        }
    }

    ##---------------------------------------------------------------------##
    ## Check time variable and region names
    ##---------------------------------------------------------------------##    
    if(!is.null(timeVar)) data$time0 <- data[, timeVar]     


    if(is.null(rownames(Amat))){
        stop("Row names of Amat needs to be specified to region names.")
    }
    if(is.null(colnames(Amat))){
        stop("Column names of Amat needs to be specified to region names.")
    }
    if(sum(rownames(Amat) != colnames(Amat)) > 0){
        stop("Row and column names of Amat needs to be the same.")
    }

    ##---------------------------------------------------------------------##
    ## End of main data input process
    ## 
    ## Process covariate information
    ##---------------------------------------------------------------------##    

    if(!is.null(X)){
        if(regionVar %in% colnames(X)){
            region.col <- which(colnames(X) == regionVar)

        # for backward compatibility, default to first column being region ID
        }else{
            region.col <- 1
        }
        if(sum(X[, region.col] %in% colnames(Amat) == FALSE) > 0 ||
           sum(colnames(Amat) %in% X[,region.col] == FALSE) > 0){
            stop(paste0(colnames(X)[region.col], "is used as region identifier in the input covariate X. It does not match the region names in the data."))
        }
    }
    
 
    ##---------------------------------------------------------------------##
    ## End of covariate checking
    ## 
    ## Set up INLA 
    ##---------------------------------------------------------------------##    

    if(responseType == "binary"){
        FUN <- expit
    }else if(responseType == "gaussian"){
        FUN <- function(x){x}
    }
    if(!isTRUE(requireNamespace("INLA", quietly = TRUE))) {
         stop("You need to install the packages 'INLA'. Please run in your R terminal:\n  install.packages('INLA', repos=c(getOption('repos'), INLA='https://inla.r-inla-download.org/R/stable'), dep=TRUE)")
    }
    # If INLA is installed, then attach the Namespace (so that all the relevant functions are available)
    if (isTRUE(requireNamespace("INLA", quietly = TRUE))) {
        if (!is.element("INLA", (.packages()))) {
          attachNamespace("INLA")
        }
    }
    hyperpc1 <- list(prec = list(prior = "pc.prec", param = c(pc.u , pc.alpha)))
    hyperpc2 <- list(prec = list(prior = "pc.prec", param = c(pc.u , pc.alpha)),  phi = list(prior = 'pc', param = c(pc.u.phi , pc.alpha.phi)))
    if(sum(!data$region0 %in% colnames(Amat)) > 0){
       stop("Exist regions in data but not in the Amat.")
    }



    ##---------------------------------------------------------------------##
    ## Start INLA input processing
    ##---------------------------------------------------------------------##

    if(!is.unit.level){
        ##---------------------------------------------------------------------##
        ## Case 1: survey data, individual level
        ##---------------------------------------------------------------------##
        if(svy && is.null(direct.est)){
            design <- survey::svydesign(
                            ids = stats::formula(clusterVar),
                            weights = ~weights0,
                            strata = ~strata0,
                            data = data, 
                            ...)
            if(!is.null(timeVar)){
                mean <- survey::svyby(formula=~response0, by=~region0+time0, design=design, survey::svymean, drop.empty.groups=FALSE)
                time.i <- as.numeric(as.character(mean$time0))
            }else{
                mean <- survey::svyby(formula=~response0, by=~region0, design=design, survey::svymean, drop.empty.groups=FALSE)
            }
            name.i <- mean$region0            
            p.i <- mean$response0
            var.i <- mean$se^2
            if(responseType == "binary"){
                ht <- log(p.i/(1-p.i))
                ht.v <- var.i/(p.i^2*(1-p.i)^2)
                ht.prec <- 1/ht.v
            }else if(responseType == "gaussian"){
                ht <- p.i
                ht.v <- var.i
                ht.prec <- 1/ht.v
            }else{
                stop("responseType argument only supports binary or gaussian at the time.")
            }
            n <- y <- NA
        ##---------------------------------------------------------------------##
        ## Case 2: unweighted data, individual level
        ##---------------------------------------------------------------------##
        }else if(is.null(direct.est)){
            if(!is.null(timeVar)){
                mean <- stats::aggregate(response0 ~ region0+time0, data = data, FUN = function(x){c(mean(x), length(x), sum(x))}, drop = FALSE)    
                  name.i <- mean$region0
                  time.i <- as.numeric(as.character(mean$time0))
                  mean <- data.frame(mean[, -c(1:2)])
            }else{
                mean <- stats::aggregate(response0 ~ region0, data = data, FUN = function(x){c(mean(x), length(x), sum(x))}, drop = FALSE)
                name.i <- mean$region0
                mean <- data.frame(mean[, -1])
            }

            colnames(mean) <- c("mean", "n", "y") 
            p.i <- mean$mean
            var.i <- p.i * (1-p.i)/mean$n
            if(responseType == "binary"){
                ht <- log(p.i/(1-p.i))
                ht.v <- var.i/(p.i^2*(1-p.i)^2)
                ht.prec <- 1/ht.v
            }else if(responseType == "gaussian"){
                ht <- p.i
                ht.v <- var.i
                ht.prec <- 1/ht.v
            }else{
                stop("responseType argument only supports binary or gaussian at the time.")
            }
            n <- mean$n
            y <- mean$y
        ##---------------------------------------------------------------------##
        ## Case 3: survey data, area level direct estimates
        ##---------------------------------------------------------------------##      
        }else{
            p.i <- data$response0
            var.i <- data$var0
            if(responseType == "binary"){
                ht <- log(p.i/(1-p.i))
                ht.v <- var.i/(p.i^2*(1-p.i)^2)
                ht.prec <- 1/ht.v
            }else if(responseType == "gaussian"){
                ht <- p.i
                ht.v <- var.i
                ht.prec <- 1/ht.v
            }else{
                stop("responseType argument only supports binary or gaussian at the time.")
            } 
            n <- NA
            y <- NA
            time.i <- as.numeric(as.character(data$time0))
            name.i <- as.character(data$region0)
        }

        dat <- data.frame(
                          HT.est = p.i,
                          HT.var = var.i, 
                          HT.logit.est = ht,
                          HT.logit.var = ht.v, 
                          HT.logit.prec = ht.prec,  
                          n = n, 
                          y = y)   
    ##---------------------------------------------------------------------##
    ## Case 4: unit level data (counts by unit)
    ##---------------------------------------------------------------------##      
    }else{
        dat <- counts
        dat$y <- dat$response0
        name.i <- dat$region0
        if(!is.null(timeVar)) time.i <- as.numeric(as.character(dat$time0))
    }

    ##---------------------------------------------------------------------##
    ## End of INLA input preparation
    ##
    ## Check time variable and region names
    ##---------------------------------------------------------------------##    
    if(!is.null(timeVar)){
        dat$time <- time.i
        dat$time.struct <- dat$time.unstruct <- time.int <- dat$time - min(dat$time) + 1
    }   
    # make it consistent with map
    regnames <- as.character(name.i)
    dat$region <- as.character(name.i)
    dat$region.unstruct <- dat$region.struct <- dat$region.int <-  match(dat$region, rownames(Amat))
    if(!is.unit.level) dat$HT.logit.est[is.infinite(abs(dat$HT.logit.est))] <- NA  

    if(!is.null(timeVar)){
        dat <- dat[order(dat[, "time.struct"], dat[, "region.struct"]), ]
    }else{
        dat <- dat[order(dat[, "region.struct"]), ]
    }
    
    ##---------------------------------------------------------------------##
    ## End of time variable and region names checking
    ##
    ## Specify INLA formula
    ##---------------------------------------------------------------------##    
    # binary non-survey area-level model
    if(!svy && responseType == "binary" && !is.unit.level){
        formulatext <- "y ~ 1"
    # binary and continuous survey area-level model  
    # and continuous non-survey area-level model  
    }else if(!is.unit.level){
        formulatext <- "HT.logit.est ~ 1"
    # unit-level model    
    }else if(length(unique(data$strata0)) == 1){
        formulatext <- "y ~ 1"
    }else{
        formulatext <- "y ~ strata0 - 1"        
    }

    ##
    ## Specifying covariates (area-level) in the formula
    ##
    fixed <- NULL
    if(!is.null(X) && is.null(X.unit)){
        X <- data.frame(X)        
        fixed <- colnames(X)[colnames(X) != regionVar]
        colnames(X)[colnames(X) == regionVar] <- "region"
        if(sum(fixed %in% colnames(dat)) > 0){
            message("The following covariates exist in the input data frame. They are replaced with region-level covariates provided in X: ", fixed[fixed %in% colnames(dat)])
            dat <- dat[, !colnames(dat) %in% fixed]
        }
        dat <- merge(dat, X, by = "region", all = TRUE)
        formulatext <- paste(formulatext, " + ", paste(fixed, collapse = " + "))

    ##
    ## Specifying covariates (unit-level) in the formula
    ##
    }else if(is.nested && !is.null(X.unit)){
        formulatext <- paste(formulatext, " + ", paste(X.unit, collapse = " + "))

        X <- data.frame(X)        
        fixed <- colnames(X)[colnames(X) != regionVar]
        colnames(X)[colnames(X) == regionVar] <- "region"
        X <- X[, colnames(X) %in% c("region", X.unit)]
        for(j in colnames(dat)){
            if(!j %in% c("region", X.unit)){
                X[, j] <- dat[match(X$region, dat$region), j]
            }
        }

        X$y <- NA
        dat <- rbind(X, dat)
        out.index <- 1:dim(X)[1]

    # unit-level model without covariates   
    }else if(is.nested){
        X <- data.frame(region = colnames(Amat))        
        for(j in colnames(dat)){
            if(!j %in% c("region")){
                X[, j] <- dat[match(X$region, dat$region), j]
            }
        }

        X$y <- NA
        dat <- rbind(X, dat)
        out.index <- 1:dim(X)[1]

    }

    if(is.null(formula)){
       if(is.iid.space){
            formula <- paste(formulatext, "f(region.struct, model = 'iid', hyper = hyperpc1)", sep = "+")    
       }else{
            formula <- paste(formulatext, "f(region.struct, graph=Amat, model='bym2', hyper = hyperpc2, scale.model = TRUE)", sep = "+")    
       }
       ##
       ##  Constraints are not specified for the interactions.
       ##
       if(!is.null(timeVar)){
            formula <- paste(formula, "f(time.unstruct, model = 'iid', hyper = hyperpc1) + f(time.struct, model=tolower(time.model), hyper = hyperpc1, scale.model = TRUE)", sep = "+")
            if(type.st == 1){
                formula <- paste(formula, "f(region.int, model = 'iid', hyper = hyperpc1, group = time.int, control.group = list(model ='iid', hyper = hyperpc1))", sep = "+")
            }else if(type.st == 2){
                formula <- paste(formula, "f(region.int, model = 'besag', graph = Amat, scale.model = TRUE, param=hyperpc1, group = time.int, control.group = list(model ='iid'))", sep = "+")
            }else if(type.st == 3){
                formula <- paste(formula, "f(region.int, model = 'iid', hyper = hyperpc1, group = time.int, control.group = list(model =tolower(time.model), scale.model = TRUE))", sep = "+")
            }else{
                formula <- paste(formula, "f(region.int, model = 'besag', graph = Amat, scale.model = TRUE, hyper = hyperpc1, group = time.int, control.group = list(model =tolower(time.model), hyper = hyperpc1, scale.model = TRUE))", sep = "+")
            }
        }
        formula <- as.formula(formula)
    }else{
        formula <- as.formula(paste(formulatext, formula, sep = "+"))
    }   

    ##---------------------------------------------------------------------##
    ## End of INLA formula
    ##
    ## Specify INLA call
    ##---------------------------------------------------------------------##   

    if(smooth){
    if(is.unit.level && responseType == "binary"){
        fit <- INLA::inla(formula, family="betabinomial", Ntrials=dat$total, control.compute = list(dic = T, mlik = T, cpo = T, config = TRUE, return.marginals.predictor=TRUE), data = dat, control.predictor = list(compute = TRUE),  lincomb = NULL, quantiles = c((1-CI)/2, 0.5, 1-(1-CI)/2)) 
    
    }else if(is.unit.level && responseType == "gaussian"){
        fit <- INLA::inla(formula, family="gaussian", control.compute = list(dic = T, mlik = T, cpo = T, config = TRUE, return.marginals.predictor=TRUE), data = dat, control.predictor = list(compute = TRUE),  lincomb = NULL, quantiles = c((1-CI)/2, 0.5, 1-(1-CI)/2))  

    }else if(!svy && responseType == "binary"){
         fit <- INLA::inla(formula, family="binomial", Ntrials=n, control.compute = list(dic = T, mlik = T, cpo = T, config = save.draws, return.marginals.predictor=TRUE), data = dat, control.predictor = list(compute = TRUE, link=1),  lincomb = NULL, quantiles = c((1-CI)/2, 0.5, 1-(1-CI)/2)) 
    
    }else if(!svy && responseType == "gaussian"){
        fit <- INLA::inla(formula, family="gaussian", control.compute = list(dic = T, mlik = T, cpo = T, config = save.draws, return.marginals.predictor=TRUE), data = dat, control.predictor = list(compute = TRUE), control.family = list(hyper= list(prec = list(initial= log(1), fixed= TRUE))), lincomb = NULL, quantiles = c((1-CI)/2, 0.5, 1-(1-CI)/2)) 
    }else{
         fit <- INLA::inla(formula, family = "gaussian", control.compute = list(dic = T, mlik = T, cpo = T, config = save.draws, return.marginals.predictor=TRUE), data = dat, control.predictor = list(compute = TRUE), control.family = list(hyper= list(prec = list(initial= log(1), fixed= TRUE))), scale = dat$HT.logit.prec,  lincomb = NULL, quantiles = c((1-CI)/2, 0.5, 1-(1-CI)/2)) 
    }
    }else{
        fit <- NULL
    }
    
    ##---------------------------------------------------------------------##
    ## End of INLA model fits
    ##
    ## Preparing data frame to hold the estimates
    ##---------------------------------------------------------------------##   

    n <- max(dat$region.struct) 
    temp <- NA
    if(!is.unit.level){
        if(!is.null(timeVar)) {
            n <- n * max(dat$time.struct)
            temp <- dat$time[1:n]
        }
        proj <- data.frame(region=dat$region[1:n], time = temp, mean=NA, var=NA, median=NA, lower=NA, upper=NA, logit.mean=NA, logit.var=NA, logit.median=NA, logit.lower=NA, logit.upper=NA)        
    }else{
        if(!is.null(timeVar)) {
            temp <- 1:max(dat$time.struct)
        }
        proj <- expand.grid(region = colnames(Amat), time = temp, strata = unique(dat$strata0))
        proj <- cbind(proj, data.frame(mean=NA, var=NA, median=NA, lower=NA, upper=NA, logit.mean=NA, logit.var=NA, logit.median=NA, logit.lower=NA, logit.upper=NA)) 
    }
  
    ##---------------------------------------------------------------------##
    ## Start computing posterior marginals for the estimates
    ##---------------------------------------------------------------------##   

    if(smooth){
    for(i in 1:dim(proj)[1]){

        ##---------------------------------------------------------------------##
        ## Estimates: Area level model
        ##            use the top filled NA rows of input directly
        ##---------------------------------------------------------------------##   

        if(!is.unit.level){
            tmp <- matrix(INLA::inla.rmarginal(1e5, fit$marginals.linear.predictor[[i]]))

        ##---------------------------------------------------------------------##
        ## Estimates: Unit level model
        ##               
        ##---------------------------------------------------------------------##   
        }else{
            if(is.nested && !is.null(X.unit)){
                which <- which(dat[out.index, ]$region == proj$region[i] & dat[out.index, ]$strata0 == proj$strata[i])[1]
                which <- out.index[which]
            }else if(is.null(timeVar)){
                which <- which(dat$region == proj$region[i] & dat$strata0 == proj$strata[i])[1]
            }else{
                which <- which(dat$region == proj$region[i] & dat$strata0 == proj$strata[i] &
                               dat$time == proj$time[i])[1] 
            }
            if(is.na(which)) next
            tmp <- matrix(INLA::inla.rmarginal(1e5, fit$marginals.linear.predictor[[which]]))
        }
        
        ##---------------------------------------------------------------------##
        ## Estimates: Apply transformations
        ##---------------------------------------------------------------------##   
    
        if(!svy && responseType == "binary"){
            tmp2 <- tmp
            proj[i, "logit.mean"] <- mean(tmp2)
            proj[i, "logit.var"] <- var(tmp2)
            proj[i, "logit.lower"] <- quantile(tmp2, (1-CI)/2)
            proj[i, "logit.upper"] <- quantile(tmp2, 1-(1-CI)/2)
            proj[i, "logit.median"] <- median(tmp2)

            proj[i, "mean"] <- fit$summary.fitted.values[i, "mean"]
            proj[i, "var"] <- fit$summary.fitted.values[i, "sd"]^2
            proj[i, "lower"] <- fit$summary.fitted.values[i, 3]
            proj[i, "upper"] <- fit$summary.fitted.values[i, 5]
            proj[i, "median"] <- fit$summary.fitted.values[i, "0.5quant"]  
        }else{
            tmp2 <- apply(tmp, 2, FUN)
            proj[i, "mean"] <- mean(tmp2)
            proj[i, "var"] <- var(tmp2)
            proj[i, "lower"] <- quantile(tmp2, (1-CI)/2)
            proj[i, "upper"] <- quantile(tmp2, 1-(1-CI)/2)
            proj[i, "median"] <- median(tmp2)

            if(responseType == "binary"){
                proj[i, "logit.mean"] <-mean(tmp)
                proj[i, "logit.var"] <- var(tmp)
                proj[i, "logit.lower"] <- quantile(tmp, (1-CI)/2)
                proj[i, "logit.upper"] <- quantile(tmp, (1-CI)/2) 
                proj[i, "logit.median"] <- median(tmp)
            }
        }
      
    }

    ##---------------------------------------------------------------------##
    ## End of posterior marginals for the estimates
    ##  
    ## Start of posterior draws: preparing empty data frame
    ##---------------------------------------------------------------------##   


    if("strata" %in% colnames(proj)){
        draws.out <- proj[, c("region", "time", "strata")]
    }else{
        draws.out <- proj[, c("region", "time")]        
    }
    for(j in 1:nsim) draws.out[, paste0("sample:", j)] <- NA
    
   ##---------------------------------------------------------------------##
   ## Saving posterior draws 
   ##---------------------------------------------------------------------##   

    sampAll <- NULL
    if(save.draws){
        sampAll <- INLA::inla.posterior.sample(n = nsim, result = fit, intern = TRUE)
        for(i in 1:dim(draws.out)[1]){
            for(j in 1:nsim){
                draws.out[i, paste0("sample:", j)]  <- sampAll[[j]]$latent[i, 1]
                if(!include_time_unstruct && !is.null(timeVar)){
                    draws.out[i, paste0("sample:", j)]  <- draws.out[i, paste0("sample:", j)] - sampAll[[j]]$latent[paste0("time.unstruct:", draws.out$time[i, 1])]
                }
            }
        }
    }


   ##---------------------------------------------------------------------##
   ## Aggregation with posterior draws for the unit level model
   ##  From stratified results to overall
   ##---------------------------------------------------------------------##   

   # Aggregation with posterior samples
   if(is.unit.level && !is.null(weight.strata) && length(unique(data$strata0)) > 1){
     proj.agg <- expand.grid(region = colnames(Amat), time = temp)
     proj.agg <- cbind(proj.agg, data.frame(mean=NA, var=NA, median=NA, lower=NA, upper=NA, logit.mean=NA, logit.var=NA, logit.median=NA, logit.lower=NA, logit.upper=NA)) 
     if(is.null(sampAll)) sampAll <- INLA::inla.posterior.sample(n = nsim, result = fit, intern = TRUE)

     for(i in 1:dim(proj.agg)[1]){
        if(is.null(timeVar)){
            which <- which(weight.strata[, regionVar] == proj.agg$region[i])
        }else{
            which <-  which(weight.strata[, regionVar] == proj.agg$region[i] &
                            weight.strata[, timeVar] == proj.agg$time[i]) 
        }
        draws <- rep(0, nsim)
        for(j in 1:length(sampAll)){
            r <- sampAll[[j]]$latent[paste0("region.struct:", match(proj.agg$region[i], colnames(Amat))), ]
            if(!is.null(timeVar)) r <- r + sampAll[[j]]$latent[paste0("time.struct:", proj.agg$time[i]), ]
            if(!is.null(timeVar) && include_time_unstruct) r <- r + sampAll[[j]]$latent[paste0("time.unstruct:", proj.agg$time[i]), ]

            # only handling region-level covariates  
            if(!is.null(X)){
                for(xx in fixed){
                    slope = sampAll[[j]]$latent[paste0(xx, ":1"), ]
                    r <- r + slope * dat[which(dat$region == proj.agg$region[i])[1], xx]
                }
            } 
            for(s in stratalist){
                intercept = sampAll[[j]]$latent[paste0("strata0", s, ":1"), ]
                if(responseType == "binary"){
                    draws[j] <- draws[j] + expit(r + intercept) * weight.strata[which, s]
                }else if(responseType == "gaussian"){
                    draws[j] <- draws[j] + (r + intercept) * weight.strata[which, s]
                }
            } 
        }
        proj.agg[i, "mean"] <- mean(draws)
        proj.agg[i, "var"] <- var(draws)
        proj.agg[i, "lower"] <- quantile(draws, (1 - CI)/2)
        proj.agg[i, "upper"] <- quantile(draws, 1 - (1 - CI)/2)
        proj.agg[i, "median"] <- median(draws)
        if(responseType == "binary"){
            proj.agg[i, "logit.mean"] <- mean(logit(draws))
            proj.agg[i, "logit.var"] <- var(logit(draws))
            proj.agg[i, "logit.lower"] <- quantile(logit(draws), (1-CI)/2)
            proj.agg[i, "logit.upper"] <- quantile(logit(draws), 1-(1-CI)/2)
            proj.agg[i, "logit.median"] <- median(logit(draws))
        }
     }
    }else{
        proj.agg <- NULL
    }
    }else{
        proj.agg <- NULL
    }


   ##---------------------------------------------------------------------##
   ## Preparing output data frame
   ##---------------------------------------------------------------------##   

   if(!is.unit.level){
       # organize output nicer 
       HT <- dat[, !colnames(dat) %in% c("region.struct", "region.unstruct", "region.int", "time.struct", "time.unstruct", "time.int")]
       HT <- cbind(region=HT[, "region"], HT[, colnames(HT) != "region"])
       if("time" %in% names(HT)){
           HT <- cbind(region=HT[, "region"], 
                       time=HT[, "time"], 
                       HT[, !(colnames(HT) %in% c("region", "time"))])
       }
       for(i in 1:dim(HT)[2]) HT[is.nan(HT[,i]), i] <- NA
       if(sum(!is.na(HT$n)) == 0) HT <- HT[, colnames(HT) != "n"]
       if(sum(!is.na(HT$y)) == 0) HT <- HT[, colnames(HT) != "y"]
       if(sum(!is.na(proj$time)) == 0) proj <- proj[, colnames(proj) != "time"]
       if(responseType == "gaussian"){
            HT <- HT[, !colnames(HT) %in% c("HT.logit.est", "HT.logit.var", "HT.logit.prec")]
            proj <- proj[, !colnames(HT) %in% c("logit.mean", "logit.var", "logit.median", "logit.lower", "logit.upper")]
       }    
   }else{
        HT <- NULL
   }

   out <- list(HT = HT,
               smooth = proj, 
               smooth.overall = proj.agg, 
               fit = fit, 
               CI = CI,
               Amat = Amat,
               responseType = responseType,
               formula = formula, 
               msg = msg)
   if(save.draws){
    out$draws <- sampAll
    out$draws.est <- draws.out
   }
   class(out) <-  "SUMMERmodel.svy"

   return(out)
}


#' @export
#' @rdname smoothSurvey
fitGeneric <-  smoothSurvey
bryandmartin/SUMMER documentation built on April 9, 2024, 10:27 a.m.