# R/eblupfh_cluster.R

#' EBLUPs based on a Fay-Herriot Model with Cluster Information.
#'
#' @description This function returns the Empirical Best Linear Unbiased Prediction (EBLUP) or Empirical Best (EB) predictor based on a Fay-Herriot model, using cluster information to produce predictions for non-sampled areas.
#'
#' @references
#' \enumerate{
#'  \item Rao, J. N., & Molina, I. (2015). Small area estimation. John Wiley & Sons.
#'  \item Anisa, R., Kurnia, A., & Indahwati, I. (2013). Cluster information of non-sampled area in small area estimation. E-Prosiding Internasional, Departemen Statistika FMIPA Universitas Padjadjaran, 1(1), 69-76.
#' }
#'
#' @param formula an object of class formula that contains a description of the model to be fitted. The variables included in the formula must be contained in the data.
#' @param data a data frame or a data frame extension (e.g. a tibble).
#' @param vardir vector or column name from data that contains the sampling variance of the direct estimator for each area.
#' @param cluster vector or column name from data that contains the cluster information.
#' @param method fitting method, either 'ML' or 'REML'. Default is 'REML'.
#' @param mse_method MSE estimation method, either 'default' or 'jackknife'. Default is 'jackknife'.
#' @param maxiter maximum number of iterations allowed in the Fisher-scoring algorithm. Default is 100 iterations.
#' @param precision convergence tolerance limit for the Fisher-scoring algorithm. Default value is 0.0001.
#' @param scale logical; whether to scale the auxiliary variables. Default is FALSE.
#' @param print_result logical; whether to print the estimated coefficients. Default is TRUE.
#'
#' @returns The function returns a list with two elements, \code{df_res} and \code{fit}:
#' \code{df_res} a data frame that contains the following columns: \cr
#'    * \code{y} response variable \cr
#'    * \code{eblup} EBLUP estimate for each area \cr
#'    * \code{random_effect} random effect for each area \cr
#'    * \code{vardir} sampling variance of the direct estimator for each area \cr
#'    * \code{mse} Mean Squared Error estimate \cr
#'    * \code{cluster} cluster information for each area \cr
#'    * \code{rse} Relative Standard Error (%) \cr
#'
#' \code{fit} a list containing the following objects: \cr
#'    * \code{estcoef} a data frame with the estimated model coefficients in the first column (beta),
#'    their asymptotic standard errors in the second column (Std.Error),
#'    the z-statistics in the third column (z-value) and the p-values of the significance of each coefficient
#'    in the last column (p-value) \cr
#'    * \code{model_formula} model formula applied \cr
#'    * \code{method} type of fitting method applied (`ML` or `REML`) \cr
#'    * \code{random_effect_var} estimated random effect variance \cr
#'    * \code{convergence} logical value indicating whether the Fisher-scoring algorithm converged \cr
#'    * \code{n_iter} number of iterations performed by the Fisher-scoring algorithm \cr
#'    * \code{goodness} vector containing several goodness-of-fit measures: log-likelihood, AIC, and BIC \cr
#'
#'
#' @details
#' The model has the form: response ~ auxiliary variables,
#' where the numeric response variable may contain NA values.
#' Areas whose response is NA are treated as non-sampled and are estimated
#' using the cluster information.
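#'
#' The underlying model is the Fay-Herriot model
#' \eqn{y_d = x_d^T \beta + u_d + e_d}, where \eqn{u_d \sim N(0, \sigma_u^2)}
#' is the area random effect and \eqn{e_d \sim N(0, \psi_d)} is the sampling
#' error with known variance \eqn{\psi_d} supplied through \code{vardir}.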
#'
#' @export
#' @examples
#' library(saens)
#'
#' m1 <- eblupfh_cluster(y ~ x1 + x2 + x3, data = mys, vardir = "var", cluster = "clust")
#' m1 <- eblupfh_cluster(y ~ x1 + x2 + x3, data = mys, vardir = ~var, cluster = ~clust)
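#'
#' # inspect the EBLUPs (non-sampled areas are filled in from cluster
#' # information) and the estimated coefficients
#' head(m1$df_res)
#' m1$fit$estcoef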
#' @md

eblupfh_cluster <- function(formula, data, vardir, cluster, method = "REML", mse_method = 'jackknife',
                            maxiter = 100, precision = 1e-04, scale = FALSE,
                            print_result = TRUE) {
  y <- stats::model.frame(formula, data, na.action = NULL)[[1]]
  nonsample <- is.na(y)
  # result data frame
  df_res <- data.frame(
    y = y,
    eblup = NA,
    cluster = NA,
    random_effect = NA,
    vardir = NA,
    g1 = NA,
    # g2 = NA, g3 = NA,
    mse = NA
  )


  if (any(is.na(y)) & missing(cluster)) {
    cli::cli_abort("variable {all.names(formula[2])} contains NA values; cluster information must be provided")
  }
  if (!any(is.na(y)) & !missing(cluster)) {
    cli::cli_warn("variable {all.names(formula[2])} does not contain NA values; the cluster variable will be ignored")
  }

  if (!any(is.na(y))) {
    # EBLUP without cluster information
    datas <- data
  } else {
    # Extract vardir and cluster
    clust <- .get_variable(data, cluster)
    if (any(is.na(clust))) {
      cli::cli_abort("{cluster} variable contains NA values.")
    }
    # sampled areas
    datas <- data[!nonsample, ]
    # non-sampled areas
    datans <- data[nonsample, ]
    df_res$cluster <- clust
  }

  vardir_name <- vardir
  vardir <- .get_variable(datas, vardir)
  formuladata <- stats::model.frame(formula, datas, na.action = NULL)
  X <- stats::model.matrix(formula, datas)
  y <- as.matrix(formuladata[1])

  if (scale) {
    X <- scale(X)
    my_scale <- attr(X, "scaled:scale")
    my_center <- attr(X, "scaled:center")
  }

  # check the fitting method choice and normalize its case
  method <- toupper(method)
  if (!method %in% c("ML", "REML")) {
    cli::cli_abort('"method" must be either ML or REML, not {method}.')
  }
  # check the MSE estimation method choice and normalize its case
  mse_method <- tolower(mse_method)
  if (!mse_method %in% c("default", "jackknife")) {
    cli::cli_abort('MSE estimation method must be either default or jackknife, not {mse_method}.')
  }
  # check whether vardir contains NA values
  if (any(is.na(vardir))) {
    cli::cli_abort("Argument {vardir_name} contains NA values.")
  }
  # check whether the auxiliary variables contain NA values
  if (any(is.na(X))) {
    cli::cli_abort("Auxiliary variables contain NA values.")
  }

  # initialize the output structure
  result <- list(
    df_res = NA,
    fit = list(
      estcoef = NA,
      model_formula = formula,
      method = method,
      random_effect_var = NA,
      convergence = TRUE,
      n_iter = 0,
      goodness = NA
    )
  )


  # initialize variables
  m <- nrow(X)
  p <- ncol(X)
  Xt <- t(X)
  k <- 0
  diff <- 1 + precision
  sigma2_u <- stats::median(vardir)
  R <- diag(vardir, m)
  Z <- diag(1, m)

  # Fisher scoring algorithm
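  # Scoring update: sigma2_u^(k+1) = sigma2_u^(k) + s / I, where s is the score
  # of the (restricted) log-likelihood and I the Fisher information; iterate
  # until the relative change falls below `precision`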
  if (method == "ML") {
    while ((diff > precision) & (k < maxiter)) {
      # random effect variance (sigma2_u) at the current iterate
      G <- diag(sigma2_u[k + 1], m)
      # variance of y
      V <- Z %*% G %*% t(Z) + R

      Vi <- solve(V)
      XtVi <- Xt %*% Vi

      # B = (X' V-1 X)-1 X' V-1 y
      # (Rao & Molina, 2015) equation 5.2.5
      Q <- solve(XtVi %*% X)
      BETA <- Q %*% XtVi %*% y

      # Partial derivative of log-likelihood with respect to sigma2_u
      # (Rao & Molina, 2015) after equation 5.2.16
      yXBeta <- y - X %*% BETA
      ViZtZ <- Vi %*% Z %*% t(Z)
      s <- -0.5 * sum(diag(ViZtZ)) - 0.5 * t(yXBeta) %*% (-ViZtZ %*% Vi) %*% (yXBeta)

      # Expected second derivative of the negative log-likelihood
      # with respect to sigma2 u (Rao & Molina, 2015) equation 5.2.17
      Isigma2_u <- sum(diag(ViZtZ %*% ViZtZ)) / 2

      k <- k + 1
      # (Rao & Molina, 2015) equation 5.2.18
      sigma2_u[k + 1] <- sigma2_u[k] + s / Isigma2_u
      diff <- abs((sigma2_u[k + 1] - sigma2_u[k]) / sigma2_u[k])
    }
  } else if (method == "REML") {
    while ((diff > precision) & (k < maxiter)) {
      # random effect variance (sigma2_u) at the current iterate
      G <- diag(sigma2_u[k + 1], m)
      # variance of y
      V <- Z %*% G %*% t(Z) + R

      Vi <- solve(V)
      XtVi <- Xt %*% Vi

      Q <- solve(XtVi %*% X)
      # (Rao & Molina, 2015) equation 5.2.21
      P <- Vi - Vi %*% X %*% Q %*% XtVi
      PZtZ <- P %*% Z %*% t(Z)
      Py <- P %*% y
      # Alternative form: Py <- Vi %*% (y - X %*% BETA)

      # Partial derivative of restricted log-likelihood with respect to sigma2_u
      # (Rao & Molina, 2015) after equation 5.2.21
      s <- -0.5 * sum(diag(PZtZ)) + 0.5 * t(y) %*% PZtZ %*% Py

      # Expected second derivative of the negative restricted log-likelihood
      # with respect to sigma2 u (Rao & Molina, 2015) equation 5.2.22
      Isigma2_u <- sum(diag(PZtZ %*% PZtZ)) / 2

      k <- k + 1
      # (Rao & Molina, 2015) equation 5.2.18
      sigma2_u[k + 1] <- sigma2_u[k] + s / Isigma2_u
      diff <- abs((sigma2_u[k + 1] - sigma2_u[k]) / sigma2_u[k])
    }
  }

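  # a negative scoring solution is truncated at zero, since the random effect
  # variance cannot be negative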
  sigma2_u <- max(sigma2_u[k + 1], 0)
  result$fit$random_effect_var <- sigma2_u

  # if the algorithm did not converge
  if (k >= maxiter && diff >= precision) {
    result$fit$convergence <- FALSE
    cli::cli_alert_danger("After {maxiter} iterations, there is no convergence.")
    return(result)
  }


  # Beta estimation -------------------------------
  # random effect variance
  G <- diag(sigma2_u, m)
  # variance of y
  V <- Z %*% G %*% t(Z) + R
  Vi <- solve(V)
  XtVi <- Xt %*% Vi
  Q <- solve(XtVi %*% X)
  BETA <- Q %*% XtVi %*% y

  # std error, pvalue beta, and eblup estimation --
  # varA <- 1 / Isigma2_u
  stderr_beta <- sqrt(diag(Q))
  zvalue <- BETA / stderr_beta
  pvalue <- 2 * stats::pnorm(abs(zvalue), lower.tail = FALSE)
  coef_est <- data.frame(BETA, stderr_beta, zvalue, pvalue)
  colnames(coef_est) <- c("beta", "Std.Error", "z-value", "p-value")

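  # EBLUP: shrink the direct estimate toward the synthetic part X %*% BETA with
  # weight gamma_d = sigma2_u / (sigma2_u + vardir_d); areas with larger
  # sampling variance borrow more strength from the regression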
  Xbeta <- X %*% BETA
  resid <- y - Xbeta
  u <- (sigma2_u / (sigma2_u + vardir)) * resid
  u <- as.vector(u)
  eblup_est <- Xbeta + u

  # Goodness ------------------------------
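  # marginal log-likelihood: y_d ~ N(x_d' beta, sigma2_u + vardir_d),
  # independent across areas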
  loglike <- -0.5 * (sum(log(2 * pi * (sigma2_u + vardir)) + (resid^2) / (sigma2_u + vardir)))
  AIC <- -2 * loglike + 2 * (p + 1)
  BIC <- -2 * loglike + (p + 1) * log(m)

  # Analytical MSE (as in the sae package) ------------------------------------
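  # Prasad-Rao decomposition: g1 = shrinkage variance, g2 = variability from
  # estimating beta, g3 = variability from estimating sigma2_u; the estimator
  # is g1 + g2 + 2*g3, plus a bias-correction term b under ML
  # (Rao & Molina, 2015)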
  g1d <- g2d <- g3d <- mse2d <- rep(0, m)
  Bd <- vardir / (sigma2_u + vardir)
  SumAD2 <- sum(diag(Vi)^2)
  VarA <- 2 / SumAD2

  if (method == "ML") {
    b <- -1 * sum(diag(Q %*% (t((Vi %*% Vi) %*% X) %*% X))) / SumAD2
    for (d in 1:m) {
      g1d[d] <- vardir[d] * (1 - Bd[d])
      xd <- matrix(X[d, ], nrow = 1, ncol = p)
      g2d[d] <- (Bd[d]^2) * xd %*% Q %*% t(xd)
      g3d[d] <- (Bd[d]^2) * VarA / (sigma2_u + vardir[d])
      mse2d[d] <- g1d[d] + g2d[d] + 2 * g3d[d] - b * (Bd[d]^2)
    }
  } else if (method == "REML") {
    for (d in 1:m) {
      g1d[d] <- vardir[d] * (1 - Bd[d])
      xd <- matrix(X[d, ], nrow = 1, ncol = p)
      g2d[d] <- (Bd[d]^2) * xd %*% Q %*% t(xd)
      g3d[d] <- (Bd[d]^2) * VarA / (sigma2_u + vardir[d])
      mse2d[d] <- g1d[d] + g2d[d] + 2 * g3d[d]
    }
  }
  # Fill in the results for sampled areas -------------------------------------
  df_res[!nonsample, "random_effect"] <- u
  df_res[!nonsample, "eblup"] <- eblup_est
  df_res[!nonsample, "g1"] <- g1d
  # df_res[!nonsample, "g2"] <- g2d
  # df_res[!nonsample, "g3"] <- g3d
  df_res[!nonsample, "vardir"] <- vardir
  df_res[!nonsample, "mse"] <- mse2d


  # Non sampled -----------------------------------------------
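  # For a non-sampled area there is no direct estimate: the prediction is the
  # synthetic part Xns %*% BETA plus a random effect borrowed from sampled
  # areas in the same cluster via .get_value() (Anisa, Kurnia & Indahwati,
  # 2013, use the cluster mean of the estimated random effects)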
  if (sum(nonsample) >= 1) {
    m_ns <- sum(nonsample)
    # Xns <- model.matrix(formula, datans)
    Xns <- stats::model.matrix(formula, stats::model.frame(~., datans, na.action = stats::na.pass))
    if (scale) {
      Xns <- scale(Xns, center = my_center, scale = my_scale)
    }
    Xbeta_ns <- Xns %*% BETA

    vardir_ns <- .get_value(vardir, clust, nonsample)
    g1_ns <- g2_ns <- rep(0, m_ns)
    Bd_ns <- vardir_ns / (sigma2_u + vardir_ns)
    g1_ns <- vardir_ns * (1 - Bd_ns)
    u_ns <- .get_value(u, clust, nonsample)

    if (mse_method == "default") {
      g3_ns <- .get_value(g3d, clust, nonsample)
      Vi_ns <- diag(1 / (sigma2_u + vardir_ns))
      XtVi_ns <- t(Xns) %*% Vi_ns
      Q_ns <- solve(XtVi_ns %*% Xns)

      for (i in 1:m_ns) {
        xd_ns <- matrix(Xns[i, ], nrow = 1, ncol = p)
        g2_ns[i] <- (Bd_ns[i]^2) * xd_ns %*% Q_ns %*% t(xd_ns)
      }

      if (method == "ML") {
        SumAD2_ns <- sum(diag(Vi_ns)^2)
        b <- -1 * sum(diag(Q_ns %*% (t((Vi_ns %*% Vi_ns) %*% Xns) %*% Xns))) / SumAD2_ns
        df_res[nonsample, "mse"] <- g1_ns + g2_ns + 2 * g3_ns - b * (Bd_ns^2)
      } else if (method == "REML") {
        df_res[nonsample, "mse"] <- g1_ns + g2_ns + 2 * g3_ns
      }
    }

    df_res[nonsample, "random_effect"] <- u_ns
    df_res[nonsample, "eblup"] <- Xbeta_ns + u_ns
    df_res[nonsample, "g1"] <- g1_ns
    df_res[nonsample, "vardir"] <- vardir_ns
    # df_res[nonsample, "g2"] <- g2_ns
    # df_res[nonsample, "g3"] <- g3_ns

    # MSE Jackknife ------------------------------------------------------------
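    # Jackknife MSE (Jiang, Lahiri & Wan, 2002; see also Rao & Molina, 2015):
    #   M1.cap = g1 - (m - 1) / m * sum_i (g1(-i) - g1)
    #   M2.cap = (m - 1) / m * sum_i (eblup(-i) - eblup)^2
    #   mse    = M1.cap + M2.cap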
    if (mse_method == "jackknife") {
      y <- stats::model.frame(formula, datas, na.action = NULL)[[1]]
      X <- stats::model.matrix(formula, datas)
      # accumulators for the two jackknife terms, one value per area
      M1_sum <- M2_sum <- rep(0, nrow(df_res))

      for (i in 1:m) {
        # refit the model with area i left out
        datas_sub <- datas[-i, ]
        sae1_sub <- eblupfh(formula, vardir = vardir_name, data = datas_sub, print_result = FALSE)
        sigma2_u_sub <- sae1_sub$fit$random_effect_var
        beta1_sub <- matrix(sae1_sub$fit$estcoef$beta)
        gamma1_sub <- sigma2_u_sub / (sigma2_u_sub + vardir)

        g1_sub <- gamma1_sub * vardir

        resid_sub <- y - X %*% beta1_sub
        u1_sub <- as.vector(gamma1_sub * resid_sub)

        y_cap1_sub <- X %*% beta1_sub + u1_sub

        u1_ns <- .get_value(u1_sub, clust, nonsample)
        g1_ns <- .get_value(g1_sub, clust, nonsample)

        df_hasil <- data.frame(nonsample, y_cap = NA, g1 = NA)
        df_hasil[!nonsample, "y_cap"] <- y_cap1_sub
        df_hasil[nonsample, "y_cap"] <- Xns %*% beta1_sub + u1_ns
        df_hasil[!nonsample, "g1"] <- g1_sub
        df_hasil[nonsample, "g1"] <- g1_ns

        # accumulate over the m leave-one-out fits
        M2_sum <- M2_sum + (df_hasil$y_cap - df_res$eblup)^2
        M1_sum <- M1_sum + (df_hasil$g1 - df_res$g1)
      }
      M2.cap <- M2_sum * (m - 1) / m
      M1.cap <- df_res$g1 - M1_sum * (m - 1) / m
      MSE.cap <- M1.cap + M2.cap

      df_res$mse <- MSE.cap
    }
    # ------------------------------------------------------------
  }

  # RSE (%) for all areas, including the case where every area is sampled
  df_res$rse <- sqrt(df_res$mse) * 100 / df_res$eblup
  df_res$g1 <- NULL
  result$df_res <- df_res
  result$fit$estcoef <- coef_est
  result$fit$goodness <- c(loglike = loglike, AIC = AIC, BIC = BIC)
  result$fit$n_iter <- k
  class(result) <- "eblupres"

  if (print_result) {
    cli::cli_alert_success("Convergence after {.orange {k}} iterations")
    cli::cli_alert("Method : {method}")
    cli::cli_h1("Coefficient")
    stats::printCoefmat(result$fit$estcoef, signif.stars = TRUE)
  }

  return(invisible(result))
}
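
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the package API): the core REML
# Fisher-scoring loop and shrinkage step used above, on simulated data with
# Z = I so that V is diagonal. All names below are hypothetical demo names;
# the block is wrapped in `if (FALSE)` so it never runs when the file is
# sourced.
if (FALSE) {
  set.seed(1)
  m <- 30
  x1 <- stats::runif(m)
  vardir <- stats::runif(m, 0.5, 1.5)       # known sampling variances psi_d
  y <- 1 + 2 * x1 +
    stats::rnorm(m, sd = sqrt(2)) +         # area effects, true sigma2_u = 2
    stats::rnorm(m, sd = sqrt(vardir))      # sampling errors
  X <- cbind(1, x1)

  sigma2_u <- stats::median(vardir)         # same starting value as above
  for (k in 1:100) {
    Vi <- diag(1 / (sigma2_u + vardir))     # V^{-1}, diagonal because Z = I
    Q <- solve(t(X) %*% Vi %*% X)
    P <- Vi - Vi %*% X %*% Q %*% t(X) %*% Vi
    s <- -0.5 * sum(diag(P)) + 0.5 * t(y) %*% P %*% P %*% y  # restricted score
    Info <- sum(diag(P %*% P)) / 2          # Fisher information
    new_s2u <- sigma2_u + as.numeric(s) / Info
    converged <- abs((new_s2u - sigma2_u) / sigma2_u) < 1e-04
    sigma2_u <- new_s2u
    if (converged) break
  }
  sigma2_u <- max(sigma2_u, 0)

  # GLS coefficients and EBLUPs via the shrinkage weights gamma_d
  Vi <- diag(1 / (sigma2_u + vardir))
  BETA <- solve(t(X) %*% Vi %*% X) %*% t(X) %*% Vi %*% y
  gamma <- sigma2_u / (sigma2_u + vardir)
  eblup <- as.vector(X %*% BETA + gamma * (y - X %*% BETA))
}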
