R/optsize.R
In surveyplanning: Survey Planning Tools

Documented in optsize

#' Optimal sample size allocation
#'
#' @description The function computes optimal sample size allocation over strata.
#'
#' @section Details:
#' If \code{s2h} and \code{Rh} is not defined, the sample allocation will be calculated as proportional allocation (proportional to the population size).
#' If \code{Rh} is not defined, the sample allocation will be calculated as Neyman allocation.
#'
#' @param H The stratum variable. One dimensional object convertible to one-column \code{data.table}, variable name as character, or column number.
#' @param n Total sample size. One dimensional object with length one.
#' @param poph Population size in each stratum. One dimensional object convertible to one-column \code{data.table}, variable name as character, or column number.
#' @param s2h The expected population variance \eqn{S^2} for variables of interest in each stratum (optional). If not defined, it is assumed to be 1 in each stratum. Object convertible to \code{data.table}, variable name as character vector, or column numbers.
#' @param Rh The expected response rate in each stratum (optional). If not defined, it is assumed to be 1 in each stratum (full-response). Object convertible to one-column \code{data.table}, variable name as character, or column number.
#' @param deffh The expected design effect for the estimate of variable (optional). If not defined, it is assumed to be 1 for each variable in each stratum. If is defined, then variables is defined the same arrangement as \code{Yh}. Object convertible to \code{data.table}, variable name as character vector, or column numbers.
#' @param fullsampleh Variable for detection fully surveyed stratum (optinal). If not defined, it is assumed to be 1 in each stratum (full-response). Object convertible to one-column \code{data.table}, variable name as character, or column number.
#' @param dataset Optional survey data object convertible to \code{data.table} with one row for each stratum.
#'

#' @return  An object as \code{data.table}, with variables:\cr
#'   \code{H} - stratum, \cr
#'   \code{variable} - the name of variable for population variance \eqn{S^2}, \cr
#'   \code{s2h} - population variance \eqn{S^2}, \cr
#'   \code{Rh} - the expectedresponse rate, \cr
#'   \code{deffh} - the expected design effect, \cr
#'   \code{poph} - population size, \cr
#'   \code{deffh} - design effect, \cr
#'   \code{fullsampleh} - full sample indicator, \cr
#'   \code{nh} - sample size.
#'
#' @seealso \code{\link{expsize}}, \code{\link{dom_optimal_allocation}}
#'
#' @keywords surveysampling
#'
#' @examples
#' library("data.table")
#' data <- data.table(H = 1 : 3,
#'                    s2h=10 * runif(3),
#'                    s2h2 = 10 * runif(3),
#'                    poph = 8 * 1 : 3,
#'                    Rh = rep(1, 3),
#'                    dd = c(1, 1, 1))
#'
#' vars <- optsize(H = "H",
#'                 s2h = c("s2h", "s2h2"),
#'                 n = 10, poph = "poph",
#'                 Rh = "Rh",
#'                 fullsampleh = NULL,
#'                 dataset = data)
#' vars
#'
#' @import data.table stats
#' @export optsize
#'
optsize <- function(H, n, poph,
                    s2h = NULL,
                    Rh = NULL,
                    deffh = NULL,
                    fullsampleh = NULL,
                    dataset = NULL) {

  ### Checking
  if( length(n) != 1 | !any(n>0 | abs(n - round(n)) < .Machine$double.eps)) stop("'n' must be a integer value greater than 0")

  if(!is.null(dataset)) {
      dataset <- data.table(dataset)
      if(!is.null(H)) {
          if (min(H %in% names(dataset)) != 1) stop("'H' does not exist in 'dataset'!")
          if (min(H %in% names(dataset)) == 1) H <- dataset[, H, with = FALSE] }
      if(!is.null(s2h)) {
          if (min(s2h %in% names(dataset)) != 1) stop("'s2h' does not exist in 'dataset'!")
          if (min(s2h %in% names(dataset)) == 1) s2h <- dataset[, s2h, with = FALSE] }
      if(!is.null(poph)) {
          if (min(poph %in% names(dataset)) != 1) stop("'poph' does not exist in 'dataset'!")
          if (min(poph %in% names(dataset)) == 1) poph <- dataset[, poph, with = FALSE] }
      if(!is.null(Rh)) {
          if (min(Rh %in% names(dataset)) != 1) stop("'Rh' does not exist in 'dataset'!")
          if (min(Rh %in% names(dataset)) == 1) Rh <- dataset[, Rh, with = FALSE] }
      if(!is.null(deffh)) {
          if (min(deffh %in% names(dataset)) != 1) stop("'deffh' does not exist in 'dataset'!")
          if (min(deffh %in% names(dataset)) == 1) deffh <- dataset[, deffh, with = FALSE] }
      if(!is.null(fullsampleh)) {
          if (min(fullsampleh %in% names(dataset)) != 1) stop("'fullsampleh' does not exist in 'dataset'!")
          if (min(fullsampleh %in% names(dataset)) == 1) fullsampleh <- dataset[, fullsampleh, with = FALSE] }
    }

  # s2h
  if (is.null(s2h)) s2h <- rep(1, nrow(H))
  s2h <- data.table(s2h, check.names = TRUE)
  m <- ncol(s2h)
  if (any(is.na(s2h))) stop("'s2h' has unknown values")
  if (!all(sapply(s2h, is.numeric))) stop("'s2h' must be numeric values")
  if (is.null(names(s2h))) stop("'s2h' must be colnames")

  # H
  H <- data.table(H)
  if (nrow(H) != nrow(s2h)) stop("'H' length must be equal with 'S2h' row count")
  if (ncol(H) != 1) stop("'H' must be 1 column data.frame, matrix, data.table")
  if (any(is.na(H))) stop("'H' has unknown values")
  if (is.null(names(H))) stop("'H' must be colnames")

  # poph
  poph <- data.frame(poph)
  if (nrow(poph) != nrow(s2h)) stop("'poph' must be equal with 's2h' row count")
  if (ncol(poph) != 1) stop("'poph' must be vector or 1 column data.frame, matrix, data.table")
  poph <- poph[, 1]
  if (!is.numeric(poph)) stop("'poph' must be numerical")
  if (any(is.na(poph))) stop("'poph' has unknown values")

  # Rh
  if (is.null(Rh)) Rh <- rep(1, nrow(s2h))
  Rh <- data.frame(Rh)
  if (nrow(Rh) != nrow(s2h)) stop("'Rh' must be equal with 's2h' row count")
  if (ncol(Rh) != 1) stop("'Rh' must be vector or 1 column data.frame, matrix, data.table")
  Rh <- Rh[, 1]
  if (!is.numeric(Rh)) stop("'Rh' must be numerical")
  if (any(is.na(Rh))) stop("'Rh' has unknown values")

  # fullsampleh
  if (is.null(fullsampleh)) fullsampleh <- rep(0, length(Rh))
  fullsampleh <- data.frame(fullsampleh)
  if (nrow(fullsampleh) != nrow(s2h)) stop("'fullsampleh' must be equal with 's2h' row count")
  if (ncol(fullsampleh) != 1) stop("'fullsampleh' must be vector or 1 column data.frame, matrix, data.table")
  fullsampleh <- fullsampleh[, 1]
  if (!is.numeric(fullsampleh)) stop("'fullsampleh' must be numerical")
  if (any(is.na(fullsampleh))) stop("'fullsampleh' has unknown values")
  if (any(!(fullsampleh %in% c(0, 1)))) stop("'fullsampleh' must be two values: 0 or 1")


  # deffh
  if (!is.null(deffh)) {
    deffh <- data.table(deffh, check.names=TRUE)
    if (nrow(deffh) != nrow(s2h)) stop("'deffh' length must be equal with 's2h' row count")
    if (ncol(deffh) != m) stop("'deffh' and 'Yh' must be equal column count")
    if (any(is.na(deffh))) stop("'deffh' has unknown values")
    if (!all(sapply(deffh, is.numeric))) stop("'deffh' must be numeric values")
    if (is.null(names(deffh))) stop("'deffh' must be colnames")
   }

  variable <- pnh <- nh <- NULL

  resulth <- data.table(H, Rh = Rh, poph = poph, fullsampleh = fullsampleh)
  nh_cor <- n - resulth[get("fullsampleh") == 1, sum(poph)]
  if  (nh_cor < 0) stop("'n' need larger!")

  s2h <- data.table(H, s2h)
  ns2h <- names(s2h)
  s2h <- melt(s2h, id = c(names(H)))
  setnames(s2h, "value", "s2h")
  resulth <- merge(s2h, resulth, all = TRUE, by = names(H))

  if (!is.null(deffh)) {
      deffh <- data.table(H, deffh)
      setnames(deffh, names(deffh), ns2h)
      deffh <- melt(deffh, id = names(H))
      setnames(deffh, "value", "deffh")
      resulth <- merge(resulth, deffh, all = TRUE, by = c(names(H), "variable"))
  } else resulth[, deffh := 1]

  resulth[, pnh := ifelse(fullsampleh == 0, poph * sqrt(s2h * deffh / Rh), 0)]
  resulth[fullsampleh == 0, nh := nh_cor * pnh / sum(pnh), by = "variable"]
  resulth[fullsampleh == 1, nh := poph]
  resulth[, pnh := NULL]
  return(resulth)
}