R/robsurvey.R

Defines functions weighted_median weighted_quantile weighted_mad rht_control weighted_total weighted_mean weighted_mean_huber weighted_total_huber svymean_huber svytotal_huber weighted_mean_trimmed weighted_total_trimmed svymean_trimmed svytotal_trimmed weighted_mean_winsorized weighted_total_winsorized svymean_winsorized svytotal_winsorized weighted_line weighted_median_line weighted_median_ratio

Documented in rht_control svymean_huber svymean_trimmed svymean_winsorized svytotal_huber svytotal_trimmed svytotal_winsorized weighted_line weighted_mad weighted_mean weighted_mean_huber weighted_mean_trimmed weighted_mean_winsorized weighted_median weighted_median_line weighted_median_ratio weighted_quantile weighted_total weighted_total_huber weighted_total_trimmed weighted_total_winsorized

#' robsurvey: Robust survey statistics.
#'
#' The package robsurvey is a collection of functions for robust survey
#' statistics.
#'
#' @section robsurvey functions:
#' robust Horvitz-Thompson M-estimator of mean and total in \code{svymean_huber()}
#' and \code{svytotal_huber()}, robust trimmed Horvitz-Thompson estimator of mean
#' and total in \code{svymean_trimmed()} and \code{svytotal_trimmed()}, robust winsorized
#' Horvitz-Thompson estimator of mean and total in \code{svymean_winsorized()} and
#' \code{svytotal_winsorized()}, weighted median estimator in \code{weighted_median()},
#' weighted quantile estimator in \code{weighted_quantile()}, weighted median
#' absolute deviation in \code{weighted_mad()}, weighted mean and total
#' estimators in \code{weighted_mean()} and \code{weighted_total()}.
#'
#' @references
#' Hulliger, B. (1995). Outlier Robust Horvitz-Thompson Estimators.
#' Survey Methodology, 21, 79 - 87.
#'
#' Hulliger, B. (2011). Main Results of the AMELI Simulation Study
#' on Advanced Methods for Laeken Indicators. In Proceedings of
#' NTTS2011, Brussels.
#'
#' @docType package
#' @name robsurvey
NULL



#' Weighted median with weighted interpolation
#'
#' \code{weighted_median} computes a weighted median where the
#' exact location corresponds exactly to a cumulative weight of 0.5.
#' This yields a symmetric median.
#'
#' Note that the \code{weighted_median} function delivers a symmetric
#' median while the \code{weighted_quantile} function with probability
#' 0.5 delivers the lower median. Hence, the results of these two
#' functions will generally differ.
#'
#' @param x a numeric vector whose weighted sample median is wanted
#' @param w a numeric vector of weights
#' @param na.rm a logical value indicating whether \code{NA} values should be
#'        stripped before the computation proceeds.
#' @return weighted sample median
#' @examples
#' x <- c(0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2)
#' weighted_median(x, x)
#' @seealso \code{\link{weighted_quantile}}
#' @export weighted_median
weighted_median <- function(x, w, na.rm=FALSE) {
  # checking and dealing with missingness
  if(missing(w)) stop('Argument w (weights) is missing, with no default.')
  sel <- is.finite(x) & is.finite(w)
  if (sum(sel) < length(x) & na.rm==FALSE) {
    stop("There are missing values in x or w. \n")
  }
  x <- x[sel]
  w <- w[sel]

  ord <- order(x)
  w <- w[ord]
  x <- x[ord]

  n <- length(x)
  cumw <- cumsum(w) / sum(w)

  # warning if one of the weights accounts for half of total weight
  if(max(w)/sum(w) > 0.5) {
    message("\n Dominance of one observation!\n")
  }

  lower.k <- max(which(cumw <= 0.5))
  upper.k <- min(which(cumw > 0.5))

  if (cumw[lower.k] < 0.5) return(x[upper.k])
  else return(0.5 * x[lower.k] + 0.5 * x[upper.k])

}



#' Weighted lower sample quantiles
#'
#' \code{weighted_quantile} computes the weighted lower sample quantile
#'
#' Weighted lower quantiles are computed using an algorithm with \eqn{O(n*log(n))}
#' in worst-case time. There exist superior algorithms; see Cormen et al.
#' (2009, Problem 9.2).
#'
#' @param x a numeric vector whose weighted sample quantiles are wanted
#' @param w a numeric vector of weights
#' @param probs a numeric vector of probabilities with values in [0,1]
#' @param na.rm a logical value indicating whether \code{NA} values should be
#'        stripped before the computation proceeds.
#' @return Weighted sample quantiles
#' @examples
#' x <- c(0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2)
#' weighted_quantile(x, x, probs = c(0.25, 0.5, 0.75))
#' @references Cormen,T.H., Leiserson, C.E., Rivest, R.L., and Stein, C. (2009):
#'    Introduction to Algorithms, 3rd ed., Cambridge: MIT Press.
#' @seealso \code{\link{weighted_median}}
#' @export weighted_quantile
#' @importFrom stats na.omit
#' @useDynLib robsurvey wquantile
weighted_quantile <- function(x, w, probs, na.rm = FALSE){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   if (is.factor(x) || is.factor(w) || is.data.frame(x)){
      stop("Arguments 'x' and 'w' must be numeric vectors\n")
   }
   n <- length(x); nw <- length(w)
   if (nw != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (n == 0){
      return(NA)
   }
   dat <- cbind(x, w)
   if (na.rm){
      dat <- na.omit(dat)
   }else if(any(is.na(dat))) {
      return(NA)
   }
   if (any(probs < 0) | any(probs > 1)){
      stop("Argument 'probs' must satisfy (elementwise): 0 <= probs <= 1!\n")
   }
   res <- NULL
   for (i in 1:length(probs)){
      tmp <- .C("wquantile", x = as.double(dat[, 1]), w = as.double(dat[, 2]),
                probs = as.double(probs[i]), q = as.double(numeric(1)),
                n = as.integer(n))
      res <- c(res, tmp$q)
   }
   return(res)
}



#' Weighted median absolute deviation from the median (MAD)
#'
#' \code{weighted_mad} computes weighted median absolute deviation from the
#' weighted median
#'
#' The weighted MAD is computed as the (normalized) weighted median of the
#' absolute deviation from the weighted median; the median is computed as the
#' weighted lower sample median (see \code{\link{weighted_median}}); the MAD
#' is normalized to be an unbiased estimate of scale at the Gaussian core model.
#'
#' @param x a numeric vector
#' @param w a numeric vector of weights
#' @param na.rm a logical value indicating whether \code{NA} values should be
#'        stripped before the computation proceeds.
#' @param constant (scale factor, default: 1.4826)
#' @return Weighted median absolute deviation from the (weighted) median
#' @examples
#' x <- c(0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2)
#' weighted_mad(x, x)
#' @seealso \code{\link{weighted_median}}
#' @export weighted_mad
#' @importFrom stats na.omit
#' @useDynLib robsurvey wmad
weighted_mad <- function(x, w, na.rm = FALSE, constant = 1.4826){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   if (is.factor(x) || is.factor(w) || is.data.frame(x)){
      stop("Arguments 'x' and 'w' must be numeric vectors\n")
   }
   n <- length(x); nw <- length(w)
   if (nw != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (n <= 1){
      return(NA)
   }
   dat <- cbind(x, w)
   if (na.rm){
      dat <- na.omit(dat)
   }else if(any(is.na(dat))) {
      return(NA)
   }
   tmp <- .C("wmad", x = as.double(dat[, 1]), w = as.double(dat[, 2]),
             mad = as.double(numeric(1)), n = as.integer(n))
   return(tmp$mad * constant)
}



#' Control function for M-estimation (tuning parameters etc.)
#'
#' This function is called internally.
#'
#' Tuning parameters for \code{\link{weighted_mean_huber}},
#' \code{\link{weighted_total_huber}}, \code{\link{svymean_huber}},
#' \code{\link{svytotal_huber}}.
#'
#' @param acc numeric tolerance, stoping rule in the iterative
#' updating scheme (default: \code{1e-5})
#' @param maxit maximum number of updating iterations
#' @param psi psi-function (\code{Huber} or \code{asymHuber})
#' @param ... additional arguments
#' @return List
#' @export rht_control
rht_control <- function(acc = 1e-5, maxit = 100, psi = "Huber", ...){
   if(!(psi %in% c("Huber", "asymHuber"))) stop("Function 'psi' must be
      either 'Huber' or 'asymHuber'\n")
   psi0 <- switch(psi,
      "Huber" = 0L,
      "asymHuber" = 1L)
   list(acc = unname(acc), maxit = unname(maxit), psi = unname(psi0))
}



#' @name wgtmeantotal
#' @aliases weighted_total
#' @aliases weighted_mean
#'
#' @title Weighted total and mean (Horvitz-Thompson and Hajek estimators)
#'
#' @description Weighted total and mean (Horvitz-Thompson and Hajek estimators)
#'
#' @details -
#'
#' @note \code{wgtmeantotal} is a generic name for the functions documented.
#'
#' @param x a numeric vector
#' @param w a numeric vector of weights
#' @param na.rm a logical value indicating whether \code{NA} values
#' should be stripped before the computation proceeds.
#' @return Estimate (scalar)
#'
#' @rdname wgtmeantotal
#' @examples
#' x <- c(0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2)
#' weighted_total(x, x)
#' @export weighted_total
#' @importFrom stats na.omit
weighted_total <- function(x, w, na.rm = FALSE){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   if (is.factor(x) || is.factor(w) || is.data.frame(x)){
      stop("Arguments 'x' and 'w' must be numeric vectors\n")
   }
   n <- length(x); nw <- length(w)
   if (nw != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (n == 0){
      return(NA)
   }
   dat <- cbind(x, w)
   if (na.rm){
      dat <- na.omit(dat)
   }else if(any(is.na(dat))) {
      return(NA)
   }
   return(sum(dat[, 1] * dat[, 2]))
}
#' @rdname wgtmeantotal
#' @examples
#' x <- c(0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2)
#' weighted_mean(x, x)
#' @export weighted_mean
#' @importFrom stats na.omit
weighted_mean <- function(x, w, na.rm = FALSE){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   if (is.factor(x) || is.factor(w) || is.data.frame(x)){
      stop("Arguments 'x' and 'w' must be numeric vectors\n")
   }
   n <- length(x); nw <- length(w)
   if (nw != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (n == 0){
      return(NA)
   }
   dat <- cbind(x, w)
   if (na.rm){
      dat <- na.omit(dat)
   }else if(any(is.na(dat))) {
      return(NA)
   }
   return(sum(dat[ ,1] * dat[, 2]) / sum(dat[, 2]))
}



#' @name huberwgt
#' @aliases weighted_mean_huber
#' @aliases weighted_total_huber
#' @aliases svymean_huber
#' @aliases svytotal_huber
#'
#' @title Huber M-estimators of the weighted mean and weighted total
#'
#' @description Weighted Huber M-estimators of the mean and total are available in two forms:
#' \itemize{
#'    \item \strong{bare-bone} functions: \code{weighted_mean_huber} and
#'	 \code{weighted_total_huber},
#'    \item estimation \strong{methods}: \code{svymean_huber} and
#'	 \code{svytotal_huber} (incl. variance estimation
#'	 based on the functionality of the \pkg{survey} package).
#' }
#'
#' @details \describe{
#'    \item{\emph{Overview}}{
#'    Robust M-estimator of the Horvitz--Thompson total or the Hajek mean
#'	 \itemize{
#'	 \item bare-bone functions: return the estimate (no variance estimation)
#'	 \item estimation methods on the basis of \pkg{survey} (incl. variance estimation)
#'	 }
#'    }
#'    \item{\emph{Type}}{
#'    Two \code{type}s of estimation methods are available:
#'	 \describe{
#'	    \item{\code{rht}}{(robust) Horvitz-Thompson M-estimator of the
#'	    total/mean
#'	    }
#'	    \item{\code{rwm}}{(robust) weighted mean estimator of
#'	    a Hajek-type estimator of the mean.
#'	    }
#'	 }
#'    If the study variable \code{x} is positively correlated with the inclusion
#'    probabilities, type \code{"rht"} tends to be superior.
#'    }
#'    \item{\emph{Scale}}{
#'    M-estimators of location are not scale invariant. The unknown scale is
#'    estimated simultaneously with the estimate of location (mean or total) as
#'    the weighted median absolute deviation from the weighted median (MAD, see
#'    \code{\link{weighted_mad}}).
#'
#'    }
#'    \item{\emph{Variance}}{
#'    Variance estimates of the mean or total estimator are computed as first-order
#'    linearization using the design-based-estimation capabilities available
#'    in package \pkg{survey}.
#'    }
#'    \item{\emph{Tuning}}{
#'    Additional arguments can be passed (via \dots) to specify the control
#'    parameters (e.g. number of iterations, psi-function, etc.); see
#'    \code{\link{rht_control}} for details.
#'    }
#'    \item{\emph{Domain estimation}}{
#'    Estimates for domains can be obtained using the \link[survey]{svyby}
#'    wrapper in the \pkg{survey} package (see examples).
#'    }
#' }
#'
#' @section Utility functions:
#' For the methods \code{svymean_huber} and \code{svytotal_huber}, the following
#' utility functions can be used
#' \itemize{
#'    \item \code{summary} gives a summary of the estimation properties
#'    \item \code{\link{robweights}} retrieves the robustness weights
#'    \item \code{coef}, \code{vcov}, \code{residuals}, and \code{fitted}
#'	 retrieve the estimate, variance, residuals and fitted
#'	 values, respectively
#' }
#'
#' @note \code{huberwgt} is a generic name for the functions documented.
#'
#' @param x a numeric vector (\code{weighted.[total/mean].huber} or
#' \code{weighted.[total/mean].huber}); a formula object or variable
#' name (\code{svymean_huber} or \code{svytotal_huber})
#' @param w a numeric vector of weights
#' @param design a \code{survey.design} object (see \link[survey]{svydesign}
#' in \pkg{survey})
#' @param k a robustness tuning constant, \eqn{k} in \eqn{[0, \infty)}
#' @param type type of estimator: \code{"rht"} (default) or \code{"rwm"}
#' @param info logical (default: \code{FALSE}); if \code{TRUE} further
#' estimation details are returned
#' @param na.rm a logical value indicating whether \code{NA} values should
#' be stripped before the computation proceeds.
#' @param ... additional arguments passed to the control object
#' (see \code{\link{rht_control}})
#'
#' @return \itemize{
#'    \item An estimate (scalar) for \code{weighted.[total/mean].huber}
#'    (unless \code{info=TRUE})
#'    \item An object of class \code{svystat.rob} for functions of the type
#'    \code{msvy[total/mean]}, i.e. a list including the following components:
#'    \code{characteristic}, \code{estimator}, \code{estimate}, \code{variance},
#'    \code{robust}, \code{optim}, \code{residuals}, \code{model}, \code{design},
#'    and \code{call}.
#' }
#'
#' @references Hulliger, B. (1995). Outlier Robust Horvitz-Thompson Estimators,
#' \emph{Survey Methodology} 21(1): 79-87.
#'
#' @seealso \code{\link{svymean_trimmed}}, \code{\link{svytotal_trimmed}},
#' \code{\link{svymean_winsorized}}, \code{\link{svytotal_winsorized}},
#' \code{\link{weighted_mean_trimmed}}, \code{\link{weighted_total_trimmed}}
#' \code{\link{weighted_mean_winsorized}}, \code{\link{weighted_total_winsorized}}
#'
#' @rdname huberwgt
#' @export weighted_mean_huber
#' @importFrom stats na.omit
#' @useDynLib robsurvey rwlslm
weighted_mean_huber <- function(x, w, k = 1.5, type = "rht", info = FALSE,
   na.rm = FALSE, ...){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   ctrl <- rht_control(...)
   if(!(type %in% c("rht", "rwm"))) stop("Argument 'type' must be either 'rht' or 'rwm'\n")
   n <- length(x)
   if (length(w) != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (is.factor(x) || is.factor(w) || is.data.frame(x)){
      stop("Arguments 'x' and 'w' must be numeric vectors\n")
   }
   n <- length(x); nw <- length(w)
   if (nw != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (n == 0){
      return(NA)
   }
   dat <- cbind(x, w)
   if (na.rm){
      dat <- na.omit(dat)
      n <- nrow(dat)
   }else if(any(is.na(dat))) {
      return(NA)
   }
   x <- switch(type,
      "rht" = rep(1, n),
      "rwm" = mean(dat[,2]) / dat[,2])
   # estimate by irwls
   tmp <- .C("rwlslm", x = as.double(x), y = as.double(dat[, 1]),
      w = as.double(dat[, 2]), resid = as.double(numeric(n)),
      infl = as.double(numeric(n)), robwgt = as.double(numeric(n)),
      n = as.integer(n), p = as.integer(1), k = as.double(k),
      beta = as.double(numeric(1)), scale = as.double(numeric(1)),
      maxit = as.integer(ctrl$maxit), tol = as.double(ctrl$acc),
      psi = as.integer(ctrl$psi))
   if(tmp$maxit == 0) message("IRWLS algorithm did not converge!\n")
   if(info){
      res <- list(characteristic = "mean",
	 estimator = paste0("M-estimator (", type, ", ", ifelse(ctrl$psi == 0,
	    "Huber", "asymHuber"), ", k = ", tmp$k,")"),
	 estimate = tmp$beta,
	 scale = tmp$scale,
	 optim = ifelse(tmp$maxit == 0, paste0("did NOT converge in ",
	 ctrl$maxit ," iterations"), paste0("converged in ", tmp$maxit,
	 " iterations")),
	 robweights = tmp$robwgt)
   }else{
      res <- tmp$beta
   }
   return(res)
}
#' @rdname huberwgt
#' @export weighted_total_huber
weighted_total_huber <- function(x, w, k = 1.5, type = "rht", info = FALSE,
   na.rm = FALSE, ...){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   res <- weighted_mean_huber(x, w, k, type, info, na.rm, ...)
   if(length(res) == 1){
      res <- res * sum(w)
   }else{
      res$characteristic <- "total"
      res$estimate <- res$estimate * sum(w)
   }
   return(res)
}
#' @rdname huberwgt
#' @examples
#' library(survey)
#' data(api)
#' dstrat <- svydesign(id=~1, strata=~stype, weights=~pw, data=apistrat, fpc=~fpc)
#' svymean_huber(~api00, dstrat, k = 2)
#' # Domain estimates
#' svyby(~api00, by = ~stype, design = dstrat, svymean_huber, k = 1.34)
#' @export svymean_huber
#' @importFrom stats model.frame na.fail
#' @importFrom stats weights
#' @useDynLib robsurvey rwlslm
svymean_huber <- function(x, design, k = 1.5, type = "rht", ...){
   ctrl <- rht_control(...)
   if (class(x) == "formula"){
      mf <- model.frame(x, design$variables, na.action = na.fail)
      n <- nrow(mf)
      if (ncol(mf) > 1) stop("Argument 'y' must be a formula of one single variable")
      yname <- names(mf)
      y <- mf[[1]]
   }else{
      if (is.character(x)){
	 yname <- x
	 y <- design$variables[, x]
	 n <- length(y)
	 if (any(is.na(y))) stop(paste0("Variable '", yname, "' must not contain NA's\n"))
      }else{
	 stop("svymean_huber is not defined for object of class: ", class(x), "\n")
      }
   }
   w <- as.numeric(weights(design))
   x <- switch(type,
      "rht" = rep(1, n),
      "rwm" = mean(w) / w)
   # estimate by irwls
   tmp <- .C("rwlslm", x = as.double(x), y = as.double(y), w = as.double(w),
      resid = as.double(numeric(n)), infl = as.double(numeric(n)),
      robwgt = as.double(numeric(n)), n = as.integer(n), p = as.integer(1),
      k = as.double(k), beta = as.double(numeric(1)),
      scale = as.double(numeric(1)), maxit = as.integer(ctrl$maxit),
      tol = as.double(ctrl$acc), psi = as.integer(ctrl$psi))
   if(tmp$maxit == 0) message("IRWLS algorithm did not converge!\n")
   names(tmp$beta) <- yname
   # compute variance (using influence function values)
   design$variables$zz <- tmp$infl
   v <- as.numeric(attr(survey::svymean(~zz, design), "var"))
   design$variables$zz <- NULL
   robweights <- tmp$robwgt
   outliers <- 1 * (abs(robweights) < 1)
   res <- list(characteristic = "mean",
      estimator = paste0("M-estimator (", type, ")"),
      estimate = tmp$beta,
      variance = v,
      robust = list(psifunction = ifelse(ctrl$psi == 0, "Huber", "asymHuber"),
	 k = k, robweights = robweights, outliers = outliers, scale = tmp$scale),
      optim = list(converged = (tmp$maxit != 0), niter = ifelse(tmp$maxit == 0,
	 ctrl$maxit, tmp$maxit)),
      residuals = tmp$resid,
      model = list(y = tmp$y, x = tmp$x, w = tmp$w),
      design = design,
      call = match.call())
   class(res) <- "svystat.rob"
   res
}
#' @rdname huberwgt
#' @export svytotal_huber
#' @importFrom stats weights
svytotal_huber <- function(x, design, k = 1.5, ...){
   tmp <- svymean_huber(x, design, k, type = "rht", ...)
   tmp$characteristic <- "total"
   sumw <- sum(weights(design))
   tmp$estimate <- tmp$estimate * sumw
   tmp$variance <- tmp$variance * sumw^2
   tmp
}



#' @name trimwgt
#' @aliases weighted_mean_trimmed
#' @aliases weighted_total_trimmed
#' @aliases svymean_trimmed
#' @aliases svytotal_trimmed
#'
#' @title Weighted trimmed mean and trimmed total
#'
#' @description Weighted trimmed estimators of the mean and total are available in two forms:
#' \itemize{
#'    \item \strong{bare-bone} functions: \code{weighted_mean_trimmed} and
#'	 \code{weighted_total_trimmed},
#'    \item estimation \strong{methods}: \code{svymean_trimmed} and
#'	 \code{svytotal_trimmed} (incl. variance estimation
#'	 based on the functionality of the \pkg{survey} package).
#' }
#'
#' @details \describe{
#'    \item{\emph{Overview}}{
#'    Robust trimmed Horvitz--Thompson total or Hajek mean
#'	 \itemize{
#'	 \item bare-bone functions: return the estimate (no variance estimation)
#'	 \item estimation methods on the basis of \pkg{survey} (incl. variance estimation)
#'	 }
#'    }
#'    \item{\emph{Variance}}{
#'    Variance estimates of the mean or total estimator are computed as first-order
#'    linearization using the design-based-estimation capabilities available
#'    in package \pkg{survey}.
#'    }
#'    \item{\emph{Domain estimation}}{
#'    Estimates for domains can be obtained using the \link[survey]{svyby}
#'    wrapper in the \pkg{survey} package (see examples).
#'    }
#' }
#'
#' @section Utility functions:
#' For the methods \code{svymean_trimmed} and \code{svytotal_trimmed}, the following
#' utility functions can be used
#' \itemize{
#'    \item \code{summary} gives a summary of the estimation properties
#'    \item \code{\link{robweights}} retrieves the robustness weights
#'    \item \code{coef}, \code{vcov}, \code{residuals}, and \code{fitted}
#'	 retrieve, respectively, the estimate, variance, residuals and fitted
#'	 values
#' }
#'
#' @note \code{trimwgt} is a generic name for the functions documented.
#'
#' @param x numeric vector (\code{weighted_mean_trimmed} or \code{weighted_total_trimmed});
#' a formula object or variable name (\code{svymean_trimmed} or \code{svytotal_trimmed})
#' @param w numeric vector of weights
#' @param design a \code{survey.design} object (see \link[survey]{svydesign} in \pkg{survey})
#' @param LB lower bound of trimming, such that \eqn{0 \leq LB < UB \leq 1}
#' @param UB upper bound of trimming, such that \eqn{0 \leq LB < UB \leq 1}
#' @param ... additional arguments (not used)
#' @param na.rm a logical value indicating whether \code{NA} values should be
#' stripped before the computation proceeds.
#'
#' @return Estimate (scalar) or object of class \code{svystat.rob}
#'
#' @seealso \code{\link{svymean_huber}}, \code{\link{svytotal_huber}},
#' \code{\link{svymean_winsorized}}, \code{\link{svytotal_winsorized}},
#' \code{\link{weighted_mean_huber}}, \code{\link{weighted_total_huber}},
#' \code{\link{weighted_mean_winsorized}}, \code{\link{weighted_total_winsorized}}
#'
#' @rdname trimwgt
#' @export weighted_mean_trimmed
#' @importFrom stats na.omit
#' @useDynLib robsurvey wmeantrimmed
weighted_mean_trimmed <- function(x, w, LB = 0.05, UB = 1 - LB, na.rm = FALSE){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   n <- length(x)
   if (length(w) != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (is.factor(x) || is.factor(w) || is.data.frame(x)){
      stop("Arguments 'x' and 'w' must be numeric vectors\n")
   }
   n <- length(x); nw <- length(w)
   if (nw != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (n == 0){
      return(NA)
   }
   dat <- cbind(x, w)
   if (na.rm){
      dat <- na.omit(dat)
      n <- nrow(dat)
   }else if(any(is.na(dat))) {
      return(NA)
   }
   if (LB >= UB) stop("Argument 'LB' must be smaller than 'UB'!")
   if (LB < 0) stop("Argument 'LB' must not be < 0!")
   if (UB > 1) stop("Argument 'UB' must not be > 1!")
   tmp <- .C("wmeantrimmed", x = as.double(x), w = as.double(w),
             lb = as.double(LB), ub = as.double(UB), mean = as.double(numeric(1)),
             n = as.integer(n))
   return(tmp$mean)
}
#' @rdname trimwgt
#' @export weighted_total_trimmed
weighted_total_trimmed <- function(x, w, LB = 0.05, UB = 1 - LB, na.rm = FALSE){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   res <- weighted_mean_trimmed(x, w, LB, UB, na.rm)
   if(length(res) == 1){
      res <- res * sum(w)
   }else{
      res$characteristic <- "total"
      res$estimate <- res$estimate * sum(w)
   }
   return(res)
}
#' @rdname trimwgt
#' @examples
#' library(survey)
#' data(api)
#' dstrat <- svydesign(id=~1, strata=~stype, weights=~pw, data=apistrat, fpc=~fpc)
#' svymean_trimmed(~api00, dstrat, LB = 0.05)
#' # Domain estimates
#' svyby(~api00, by = ~stype, design = dstrat, svymean_trimmed, LB = 0.1)
#' @export svymean_trimmed
#' @importFrom stats model.frame na.fail
#' @importFrom stats weights
svymean_trimmed <- function(x, design, LB = 0.05, UB = 1 - LB, ...){
   if (class(x) == "formula"){
      mf <- model.frame(x, design$variables, na.action = na.fail)
      n <- nrow(mf)
      if (ncol(mf) > 1) stop("Argument 'y' must be a formula of one single variable")
      yname <- names(mf)
      y <- mf[[1]]
   }else{
      if (is.character(x)){
	 yname <- x
	 y <- design$variables[, x]
	 n <- length(y)
	 if (any(is.na(y))) stop(paste0("Variable '", yname, "' must not contain NA's\n"))
      }else{
	 stop("svymean_trimmed is not defined for object of class: ", class(x), "\n")
      }
   }
   w <- as.numeric(weights(design))
   est <- weighted_mean_trimmed(y, w, LB, UB)
   names(est) <- yname
   # compute influence function
   quant <- weighted_quantile(y, w, probs = c(LB, UB))
   below <- floor(LB * n)
   above <- ceiling(UB * n)
   mat <- c(rep((1 - LB) * quant[1] - (1 - UB) * quant[2], below),
      rep(-LB * quant[1] - (1 - UB) * quant[2], (above - below)),
      rep(UB * quant[2] - LB * quant[1], (n - above)))
   if(below != 0){
      y[1:below] <- 0
   }
   if(above != n){
      y[(above + 1):n] <- 0
   }
   infl <- (y + mat) * (1 / (UB - LB)) - est
   # compute variance (using influence function values)
   design$variables$zz <- infl
   v <- as.numeric(attr(survey::svymean(~zz, design), "var"))
   design$variables$zz <- NULL
   # return value
   res <- list(characteristic = "mean",
      estimator = paste0("Weighted trimmed estimator (LB = ", LB, ", UB = ",
	 UB, ")"),
      estimate = est,
      variance = v,
      robust = list(robweights = NULL),
      optim = NULL,
      residuals = y - est,
      model = list(y = y, w = w),
      design = design,
      call = match.call())
   class(res) <- "svystat.rob"
   res
}
#' @rdname trimwgt
#' @export svytotal_trimmed
#' @importFrom stats weights
svytotal_trimmed <- function(x, design, LB = 0.05, UB = 1 - LB, ...){
   tmp <- svymean_trimmed(x, design, LB, UB)
   tmp$characteristic <- "total"
   sumw <- sum(weights(design))
   tmp$estimate <- tmp$estimate * sumw
   tmp$variance <- tmp$variance * sumw^2
   tmp
}



#' @name winswgt
#' @aliases weighted_mean_winsorized
#' @aliases weighted_total_winsorized
#' @aliases svymean_winsorized
#' @aliases svytotal_winsorized
#'
#' @title Weighted winsorized mean and trimmed total
#'
#' @description Weighted winsorized estimators of the mean and total are available in two forms:
#' \itemize{
#'    \item \strong{bare-bone} functions: \code{weighted_mean_winsorized} and
#'	 \code{weighted_total_winsorized},
#'    \item estimation \strong{methods}: \code{svymean_winsorized} and
#'	 \code{svytotal_winsorized} (incl. variance estimation
#'	 based on the functionality of the \pkg{survey} package).
#' }
#'
#' @details \describe{
#'    \item{\emph{Overview}}{
#'    Robust winsorized Horvitz--Thompson total or Hajek mean
#'	 \itemize{
#'	 \item bare-bone functions: return the estimate (no variance estimation)
#'	 \item estimation methods on the basis of \pkg{survey} (incl. variance estimation)
#'	 }
#'    }
#'    \item{\emph{Variance}}{
#'    Variance estimates of the mean or total estimator are computed as first-order
#'    linearization using the design-based-estimation capabilities available
#'    in package \pkg{survey}.
#'    }
#'    \item{\emph{Domain estimation}}{
#'    Estimates for domains can be obtained using the \link[survey]{svyby}
#'    wrapper in the \pkg{survey} package (see examples).
#'    }
#' }
#'
#' @section Utility functions:
#' For the methods \code{svymean_winsorized} and \code{svytotal_winsorized}, the following
#' utility functions can be used
#' \itemize{
#'    \item \code{summary} gives a summary of the estimation properties
#'    \item \code{\link{robweights}} retrieves the robustness weights
#'    \item \code{coef}, \code{vcov}, \code{residuals}, and \code{fitted}
#'	 retrieve, respectively, the estimate, variance, residuals and fitted
#'	 values
#' }
#'
#' @note \code{winswgt} is a generic name for the functions documented.
#'
#' @param x numeric vector (\code{weighted_mean_winsorized} or
#' \code{weighted_total_winsorized}); a formula object or variable name
#' (\code{svymean_winsorized} or \code{svytotal_winsorized})
#' @param w numeric vector of weights
#' @param design a \code{survey.design} object (see \link[survey]{svydesign}
#' in \pkg{survey})
#' @param LB lower bound of winsorizing, such that \eqn{0 \leq LB < UB \leq 1}
#' @param UB upper bound of winsorizing, such that \eqn{0 \leq LB < UB \leq 1}
#' @param ... additional arguments (not used)
#' @param na.rm a logical value indicating whether \code{NA} values should be
#' stripped before the computation proceeds.
#'
#' @return Estimate (scalar) or object of class \code{svystat.rob}
#'
#' @seealso \code{\link{svymean_huber}}, \code{\link{svytotal_huber}},
#' \code{\link{svymean_trimmed}}, \code{\link{svytotal_trimmed}},
#' \code{\link{weighted_mean_huber}}, \code{\link{weighted_total_huber}},
#' \code{\link{weighted_mean_trimmed}}, \code{\link{weighted_total_trimmed}}
#'
#' @rdname winswgt
#' @export weighted_mean_winsorized
#' @importFrom stats na.omit
#' @useDynLib robsurvey wmeanwinsorized
weighted_mean_winsorized <- function(x, w, LB = 0.05, UB = 1 - LB,
   na.rm = FALSE){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   n <- length(x)
   if (length(w) != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (is.factor(x) || is.factor(w) || is.data.frame(x)){
      stop("Arguments 'x' and 'w' must be numeric vectors\n")
   }
   n <- length(x); nw <- length(w)
   if (nw != n) stop("Vectors 'x' and 'w' are not of the same dimension\n")
   if (n == 0){
      return(NA)
   }
   dat <- cbind(x, w)
   if (na.rm){
      dat <- na.omit(dat)
      n <- nrow(dat)
   }else if(any(is.na(dat))) {
      return(NA)
   }
   if (LB >= UB) stop("Argument 'LB' must be smaller than 'UB'!")
   if (LB < 0) stop("Argument 'LB' must not be < 0!")
   if (UB > 1) stop("Argument 'UB' must not be > 1!")
   tmp <- .C("wmeanwinsorized", x = as.double(x), w = as.double(w),
      lb = as.double(LB), ub = as.double(UB), mean = as.double(numeric(1)),
      n = as.integer(n))
   return(tmp$mean)
}
#' @rdname winswgt
#' @export weighted_total_winsorized
weighted_total_winsorized <- function(x, w, LB = 0.05, UB = 1 - LB, na.rm = FALSE){
   if(missing(w)) stop('Argument w (weights) is missing, with no default.')
   res <- weighted_mean_winsorized(x, w, LB, UB, na.rm)
   if(length(res) == 1){
      res <- res * sum(w)
   }else{
      res$characteristic <- "total"
      res$estimate <- res$estimate * sum(w)
   }
   return(res)
}
#' @rdname winswgt
#' @examples
#' library(survey)
#' data(api)
#' dstrat <- svydesign(id=~1, strata=~stype, weights=~pw, data=apistrat, fpc=~fpc)
#' svymean_winsorized(~api00, dstrat, LB = 0.05)
#' # Domain estimates
#' svyby(~api00, by = ~stype, design = dstrat, svymean_winsorized, LB = 0.1)
#' @export svymean_winsorized
#' @importFrom stats model.frame na.fail
#' @importFrom stats weights
svymean_winsorized <- function(x, design, LB = 0.05, UB = 1 - LB, ...){
   if (class(x) == "formula"){
      mf <- model.frame(x, design$variables, na.action = na.fail)
      n <- nrow(mf)
      if (ncol(mf) > 1) stop("Argument 'y' must be a formula of one single variable")
      yname <- names(mf)
      y <- mf[[1]]
   }else{
      if (is.character(x)){
	 yname <- x
	 y <- design$variables[, x]
	 n <- length(y)
	 if (any(is.na(y))) stop(paste0("Variable '", yname, "' must not contain NA's\n"))
      }else{
	 stop("svymean_winsorized is not defined for object of class: ", class(x), "\n")
      }
   }
   w <- as.numeric(weights(design))
   est <- weighted_mean_winsorized(y, w, LB, UB)
   names(est) <- yname
   # compute influence function
   # FIXME: variance => density estimate (now, it is trimmed mean variance)
   quant <- weighted_quantile(y, w, probs = c(LB, UB))
   below <- floor(LB * n)
   above <- ceiling(UB * n)
   mat <- c(rep((1 - LB) * quant[1] - (1 - UB) * quant[2], below),
      rep(-LB * quant[1] - (1 - UB) * quant[2], (above - below)),
      rep(UB * quant[2] - LB * quant[1], (n - above)))
   if(below != 0){
      y[1:below] <- 0
   }
   if(above != n){
      y[(above + 1):n] <- 0
   }
   infl <- (y + mat) * (1 / (UB - LB)) - est
   # compute variance (using influence function values)
   design$variables$zz <- infl
   v <- as.numeric(attr(survey::svymean(~zz, design), "var"))
   design$variables$zz <- NULL
   # return value
   res <- list(characteristic = "mean",
      estimator = paste0("Weighted winsorized estimator (LB = ", LB, ", UB = ",
	 UB, ")"),
      estimate = est,
      variance = v,
      robust = list(robweights = NULL),
      optim = NULL,
      residuals = y - est,
      model = list(y = y, w = w),
      design = design,
      call = match.call())
   class(res) <- "svystat.rob"
   res
}
#' @rdname winswgt
#' @export svytotal_winsorized
#' @importFrom stats weights
svytotal_winsorized <- function(x, design, LB = 0.05, UB = 1 - LB, ...){
   tmp <- svymean_winsorized(x, design, LB, UB)
   tmp$characteristic <- "total"
   sumw <- sum(weights(design))
   tmp$estimate <- tmp$estimate * sumw
   tmp$variance <- tmp$variance * sumw^2
   tmp
}



#' Weighted robust line fitting
#'
#' \code{weighted_line} fits a robust line and allows weights.
#'
#' Uses different quantiles for splitting the sample than \code{line()}.
#' Is based on \code{weighted_median()}.
#'
#' @param x a numeric vector (explanatory variable)
#' @param y a numeric vector (response variable)
#' @param w a numeric vector of weights
#' @param iter number of iterations for enhancing the slope
#' @param na.rm a logical value indicating whether rows with \code{NA}
#' values should be stripped before the computation proceeds
#' @return intercept and slope of the fitted line
#' @examples
#' data(cars)
#' weighted_line(cars$speed, cars$dist, w=rep(1, length(cars$speed)))
#' weighted_line(cars$speed, cars$dist, w=rep(1:10, each=5))
#' @seealso \code{\link[stats]{line}}
#' @export weighted_line
#' @importFrom stats model.frame complete.cases
weighted_line <- function(x, y=NULL, w, na.rm=FALSE, iter = 1){

  if(missing(w)) stop('Argument w (weights) is missing, with no default.')

  # quantiles as implemented in line() but with weights
  # x and w sorted according to x
  line.quantile <- function(x, w, prob){
    n <- length(x)
    cumw <- n * cumsum(w)/sum(w)
    low <- min(which(cumw >= (n-1) * prob))
    high <- max(which(cumw <= (n-1) * prob))
    return((x[low+1] + x[high+1])/2)
  }

  if (inherits(x, "formula")) {
    mf <- model.frame(x)
    y <- mf[, 1]
    x <- mf[, -1]
  }
  if (NCOL(x) > 1) stop("x contains more than 1 explanatory variables.")
  dat <- data.frame(x, y, w)
  ok <- complete.cases(dat$x, dat$y, dat$w)
  n <- sum(ok)
  if (n < length(x)) {
    if (na.rm) {x <- dat$x[ok]; y <- dat$y[ok]; w <- dat$w[ok]} else
      stop("There are missing values in x, y or w.\n")
  }

  # Sort data according to x
  ord <- order(x)
  x <- x[ord]
  y <- y[ord]

  # standardise weights to n
  w <- n * w[ord] / sum(w)

  # Groups
  lim1 <- line.quantile(x, w, 1/3)
  lim2 <- line.quantile(x, w, 2/3)
  groups <- (x <= lim1) * 1 + (x > lim1 & x < lim2) * 2 + (x >= lim2) * 3

  # Medians for x
  wmedx <- c(weighted_median(x[groups==1], w[groups==1]),
             weighted_median(x[groups==3], w[groups==3]))

  # polishing (affects only the slope)
  slope <- 0; r <- y; j <- 0
  while (j <= iter - 1) {
    wmedr <- c(weighted_median(r[groups==1], w[groups==1]),
               weighted_median(r[groups==3], w[groups==3]))
    slope <- slope + (wmedr[2] - wmedr[1]) / (wmedx[2] - wmedx[1])
    r <- y - slope * x
    j <- j + 1
  }

  # intercept and predicted values
  intercept <- weighted_median(r, w)
  yhat <- intercept + slope * x

  # return
  structure(list(call = sys.call(), coefficients = c(intercept, slope),
                 residuals = y - yhat, fitted.values = yhat), class = "tukeyline")
}



#' Robust simple linear regression based on medians
#'
#' For type medslopes the median individual ratios response/explanatory
#' is used as estimator of the slope. For version ratiomeds the ratio of
#' the median crossproduct to the median of squares of the explanatory
#' variable is used as the estimator of the slope. Survey weights may be used.
#' Missing values are neglected.
#'
#' Uses \code{weighted_median()}. The median of slopes (type="slopes")
#' uses \eqn{b1=M((y-M(y,w))/(x-M(x,w)), w)}. The median of crossproducts
#' by median of squares (type="products") uses
#' \eqn{b1=M((y-M(y,w))(x-M(x,w)), w )/ M((x-M(x,w )^2), w )}, where
#' \eqn{M(x, w)} is shorthand for the function \code{weighted_median(x, w)}.
#' The function allows weights and missing values.
#'
#' @param x a numeric vector (explanatory variable)
#' @param y a numeric vector (response variable)
#' @param w a numeric vector of weights
#' @param type either "slopes" (default) or "products"
#' @param na.rm a logical value indicating whether rows with \code{NA}
#' values should be stripped before the computation proceeds
#' @return a vector with two components: intercept and slope
#' @examples
#' x <- c(1, 2, 4, 5)
#' y <- c(3, 2, 7, 4)
#' weighted_line(y~x, w=rep(1, length(x)))
#' weighted_median_line(y~x, w=rep(1, length(x)))
#' weighted_median_line(y~x, w=rep(1, length(x)), type="prod")
#'
#' data(cars)
#' with(cars, weighted_median_line(dist ~ speed, w=rep(1, length(dist))))
#' with(cars, weighted_median_line(dist ~ speed, w=rep(1, length(dist)), type="prod"))
#'
#' # weighted
#' w <- c(rep(1,20), rep(2,20), rep(5, 10))
#' with(cars, weighted_median_line(dist ~ speed, w=w))
#' with(cars, weighted_median_line(dist ~ speed, w=w, type="prod"))
#'
#' # outlier in y
#' cars$dist[49] <- 360
#' with(cars, weighted_median_line(dist ~ speed, w=w))
#' with(cars, weighted_median_line(dist ~ speed, w=w, type="prod"))
#'
#' # outlier in x
#' data(cars)
#' cars$speed[49] <- 72
#' with(cars, weighted_median_line(dist ~ speed, w=w))
#' with(cars, weighted_median_line(dist ~ speed, w=w, type="prod"))
#' @seealso \code{\link[stats]{line}}, \code{\link{weighted_line}},
#' \code{\link{weighted_median_ratio}}
#' @export weighted_median_line
#' @importFrom stats complete.cases
#' @importFrom grDevices xy.coords
weighted_median_line <- function(x, y=NULL, w, type="slopes", na.rm=FALSE){

  if(missing(w)) stop('Argument w (weights) is missing, with no default.')

  if (inherits(x, "formula")) {
    dat <- xy.coords(x)
    x <- dat$x
    y <- dat$y
  }
  if (NCOL(x) > 1) stop("x contains more than 1 explanatory variables.")

  # Robustification type
  stype <- pmatch(type, c("slopes"), nomatch=2)

  # Ensure case-wise completeness
  ok <- complete.cases(data.frame(x, y, w))
  n <- sum(ok)
  if (n < length(x)) {
    if (na.rm) {x <- x[ok]; y <- y[ok]; w <- w[ok]} else
      stop("There are missing values in x, y or w.")
  }

  # univariate medians
  wmedx <- weighted_median(x, w)
  wmedy <- weighted_median(y, w)

  # slope (remove NA created due to division by 0)
  slope <- switch(stype,
                  weighted_median((y - wmedy) / (x - wmedx), w, na.rm=TRUE),
                  weighted_median((y - wmedy) * (x - wmedx), w, na.rm=TRUE) / weighted_median((x - wmedx)^2, w, na.rm=TRUE)
  )

  # residuals
  r <- y - slope * x

  # intercept
  intercept <- weighted_median(r, w)
  yhat <- intercept + slope * x

  # return
  structure(list(call = sys.call(), coefficients = c(intercept, slope),
                 residuals = y - yhat, fitted.values = yhat), class = "medline")

}



#' Weighted robust ratio based on median
#'
#' A weighted median of the ratios y/x determines the slope of a
#' regression through the origin.
#'
#' @param x a numeric vector (explanatory variable)
#' @param y a numeric vector (response variable)
#' @param w a numeric vector of (optional) weights
#' @param na.rm a logical value indicating whether rows with \code{NA}
#' values should be stripped before the computation proceeds
#' @return a vector with two components: intercept and slope
#' @examples
#' x <- c(1,2,4,5)
#' y <- c(1,0,5,2)
#' weighted_median_ratio(y~x, w = rep(1, length(y)))
#' @seealso \code{\link[stats]{line}}, \code{\link{weighted_line}},
#' \code{\link{weighted_median_line}}
#' @export weighted_median_ratio
#' @importFrom stats complete.cases
#' @importFrom grDevices xy.coords
weighted_median_ratio <- function(x, y=NULL, w, na.rm=FALSE){

  if(missing(w)) stop('Argument w (weights) is missing, with no default.')

  if (inherits(x, "formula")) {
    dat <- xy.coords(x)
    y <- dat$y
    x <- dat$x
  }
  if (NCOL(x) > 1) stop("x contains more than 1 explanatory variables.")

  # Ensure case-wise completeness
  ok <- complete.cases(data.frame(x, y, w))
  n <- sum(ok)
  if (n < length(x)) {
    if (na.rm) {x <- x[ok]; y <- y[ok]; w <- w[ok]} else
      stop("There are missing values in x, y or w.")
  }

  # ratio
  ratio <- weighted_median(y / x, w, na.rm=TRUE)

  # fitted.varlues and residuals
  yhat <- ratio * x
  r <- y - yhat

  # return
  structure(list(call = sys.call(), coefficients = ratio,
                 residuals = y - yhat, fitted.values = yhat), class = "medline")

}
martinSter/robsurvey documentation built on Oct. 11, 2019, 4:45 p.m.