R/survey_sim_est.R
In surveySimR: Estimation of Population Total under Complex Sampling Design

Documented in survey_sim_est

#' Estimation of Finite Population Total under Complex Sampling Design Viz. SRSWOR
#'
#' This function evaluates and compares the performance of three estimators—Horvitz-Thompson (HT), Ratio, and Regression-for estimating the finite population total using repeated sampling and simulation under Simple Random Sampling Without Replacement (SRSWOR).
#'
#' @description
#' Sample surveys use scientific methods to draw inferences about population parameters by observing a representative part of the population, called sample.SRSWOR (Simple Random Sampling Without Replacement) is one of the most widely used probability sampling designs,wherein every unit has an equal chance of being selected and units are not repeated.This function draws multiple SRSWOR samples from a finite population and estimates the population parameter i.e. total of HT, Ratio, and Regression estimators.Repeated simulations (e.g., 500 times) are used to assess and compare estimators using metrics such as percent relative bias (%RB),percent relative root means square error (%RRMSE).
#'
#' @param Y Numeric vector. The study variable for which the population total is to be estimated.
#' @param X Numeric vector. The auxiliary variable used for ratio and regression estimators.
#' @param n Integer. Sample size to be drawn from the population in each simulation (default is 40).
#' @param SimNo Integer. Number of simulations or iterations to be performed (default is 500).
#' @param seed Integer. Random seed for reproducibility (default is 123).
#'
#' @return A data frame with the following columns:
#' - Est_Total: Estimated population total of HT, Ratio, and Regression estimators.
#' - RB: Relative Bias (%) of each estimator.
#' - RRMSE: Relative Root Means Square Error (%) of each estimator.
#' - Skewness: Skewness of estimator distributions.
#' - Kurtosis: Kurtosis of estimator distributions.
#' - Coverage: Coverage probability (within ±1.96 SE of true total) of each estimator.
#' - PopVar: Theoretical variance of each estimator.
#' - EmpVar: Empirical variance from simulations.
#' - EstVar: Average estimate of variances across simulations.
#' - CV: Coefficient of variation (%) of each estimator.
#'
#' @references
#' 1. Cochran, W. G. (1977). \emph{Sampling Techniques, 3rd Edition}. New York: John Wiley & Sons, Inc.
#'
#' 2. Singh, D. and Chaudhary, F.S. (1986). \emph{Theory and Analysis of Sample Survey Designs}. New York: John Wiley & Sons, Inc.
#'
#' 3. Sukhatme, P.V., Sukhatme, B.V., Sukhatme, S. and Asok, C. (1984). \emph{Sampling Theory of Surveys with Applications}. Iowa State University Press, Ames and Indian Society of Agricultural Statistics, New Delhi.
#'
#' 4. Särndal, C.-E., Swensson, B., & Wretman, J. (1992). Model Assisted Survey Sampling. Springer.
#' @examples
#' set.seed(101)
#' N <- 400
#' X <- runif(N, 5, 15)
#' Y <- 10 + 2 * X + rnorm(N, sd = 2)
#' survey_sim_est(Y, X, n = 40, SimNo = 500)
#'
#' @export
#' @importFrom moments skewness kurtosis
#' @importFrom stats cor sd var
#'
survey_sim_est <- function(Y, X, n = 40, SimNo = 500, seed = NULL) {
  if (length(Y) != length(X)) stop("Y and X must be of the same length.")

  if(!is.null(seed)){
    set.seed(seed)
  }

  N <- length(Y)
  Y.total <- sum(Y)
  X.total <- sum(X)
  True.Total <- rep(Y.total, SimNo)

  data <- data.frame(ID = 1:N, Y = Y, X = X)

  est1 <- est2 <- est3 <- var_est1 <- var_est2 <- var_est3 <- numeric(SimNo)

  for (h in 1:SimNo) {
    set.seed(h)
    sa <- sample(1:N, n, replace = FALSE)
    samp <- data[sa, ]
    di <- rep(N / n, n)

    y_samp <- samp$Y
    x_samp <- samp$X
    s2_y <- var(y_samp)
    s2_x <- var(x_samp)
    rho_hat <- cor(y_samp, x_samp)
    R_hat <- mean(y_samp) / mean(x_samp)

    # HT Estimator
    est1[h] <- sum(di * y_samp)
    var_est1[h] <- N^2 * ((1 / n) - (1 / N)) * s2_y

    # Ratio Estimator
    ratio1 <- di * y_samp
    ratio2 <- di * x_samp
    est2[h] <- (sum(ratio1) / sum(ratio2)) * X.total
    var_est2[h] <- N^2 * ((1 / n) - (1 / N)) * (s2_y + (R_hat^2 * s2_x) - (2 * R_hat * rho_hat * sqrt(s2_y * s2_x)))

    # Regression Estimator
    b_hat <- rho_hat * sqrt(s2_y) / sqrt(s2_x)
    est3[h] <- est1[h] + b_hat * (X.total - sum(ratio2))
    var_est3[h] <- N^2 * ((1 / n) - (1 / N)) * s2_y * (1 - rho_hat^2)
  }

  # Population-level parameters
  pop_s2_y <- var(Y)
  pop_s2_x <- var(X)
  R_pop <- mean(Y) / mean(X)
  rho_pop <- cor(Y, X)

  var_HT <- N^2 * ((1 / n) - (1 / N)) * pop_s2_y
  var_ratio <- N^2 * ((1 / n) - (1 / N)) * (pop_s2_y + R_pop^2 * pop_s2_x - 2 * R_pop * rho_pop * sqrt(pop_s2_y * pop_s2_x))
  var_reg <- N^2 * ((1 / n) - (1 / N)) * pop_s2_y * (1 - rho_pop^2)

  # Evaluation metrics
  RB <- 100 * colMeans(cbind(est1, est2, est3) - True.Total) / mean(True.Total)
  RRMSE <- 100 * sqrt(colMeans((cbind(est1, est2, est3) - True.Total)^2)) / mean(True.Total)
  CV <- 100 * apply(cbind(est1, est2, est3), 2, sd) / colMeans(cbind(est1, est2, est3))
  skew <- apply(cbind(est1, est2, est3), 2, moments::skewness)
  kurt <- apply(cbind(est1, est2, est3), 2, moments::kurtosis)
  emp_var <- apply(cbind(est1, est2, est3), 2, var)
  est_var <- c(mean(var_est1), mean(var_est2), mean(var_est3))
  coverage <- colMeans(abs(cbind(est1, est2, est3) - True.Total) <= 1.96 * sqrt(cbind(var_est1, var_est2, var_est3)))

  result <- data.frame(
    Est_Total = c(mean(est1), mean(est2), mean(est3)),
    RB = RB,
    RRMSE = RRMSE,
    Skewness = skew,
    Kurtosis = kurt,
    Coverage = coverage,
    PopVar = c(var_HT, var_ratio, var_reg),
    EmpVar = emp_var,
    EstVar = est_var,
    CV = CV
  )

  rownames(result) <- c("HT", "Ratio", "Regression")
  return(round(result, 3))
}