# R/weighting.R
# In questionr: Functions to Make Surveys Processing Easier

# Documented in wtd.var

##' Weighted mean and variance of a vector
##'
##' Compute the weighted mean or weighted variance of a vector. Exact copies of Hmisc
##' functions.
##'
##' @aliases wtd.var
##' @param x Numeric data vector
##' @param weights Numeric weights vector. Must be the same length as \code{x}
##' @param na.rm if \code{TRUE}, delete \code{NA} values.
##' @details
##' If \code{weights} is \code{NULL}, then a uniform weighting is applied.
##' @author
##' These functions are exact copies of the \code{wtd.mean} and \code{wtd.var}
##' functions from the \code{Hmisc} package (see \link[Hmisc]{wtd.stats}). They have been created by
##' Frank Harrell, Department of Biostatistics, Vanderbilt University School of
##' Medicine, <f.harrell@@vanderbilt.edu>.
##' @seealso
##' \code{\link{mean}},\code{\link{var}}, \code{\link{wtd.table}} and the \code{survey} package.
##' @examples
##' data(hdv2003)
##' mean(hdv2003$age)
##' wtd.mean(hdv2003$age, weights=hdv2003$poids)
##' @export wtd.mean

`wtd.mean` <-
  function(x, weights = NULL, na.rm = TRUE) {
    # Weighted mean of `x`, computed as sum(weights * x) / sum(weights).
    #   x        numeric data vector
    #   weights  numeric weights, same length as `x`; when NULL or empty the
    #            unweighted mean is returned and a warning is raised
    #   na.rm    if TRUE, drop every position where `x` or `weights` is NA
    if (!length(weights)) {
      warning("no weights argument given, computing unweighted mean")
      return(mean(x, na.rm = na.rm))
    }
    # Without this guard a shorter `weights` is recycled in `x + weights` but
    # not in `weights[keep]`, which silently yields NA or a wrong value.
    if (length(x) != length(weights)) stop("x and weights lengths must be the same")
    if (na.rm) {
      # x + weights is NA wherever either operand is NA.
      keep <- !is.na(x + weights)
      x <- x[keep]
      weights <- weights[keep]
    }
    sum(weights * x) / sum(weights)
  }



#' Weighted one-way and two-way frequency tables.
#'
#' Generate weighted frequency tables, both for one-way and two-way tables.
#'
#' @param x a vector
#' @param y another optional vector for a two-way frequency table. Must be the same length as \code{x}
#' @param weights vector of weights, must be the same length as \code{x}
#' @param normwt if TRUE, normalize weights so that the total weighted count is the same as the unweighted one
#' @param useNA whether to include NA values in the table
#' @param na.show (deprecated) if TRUE, show NA count in table output
#' @param na.rm (deprecated) if TRUE, remove NA values before computation
#' @param digits Number of significant digits.
#' @param exclude values to remove from x and y. To exclude NA, use na.rm argument.
#' @details
#' If \code{weights} is not provided, a uniform weighting is used.
#'
#' If some weights are missing (`NA`), they are converted to zero. In case of missing weights
#' with `normwt=TRUE`, the observations with missing weights are still counted in the unweighted
#' count. You have to filter them out before using this function if you don't want them to be
#' taken into account when using `normwt`.
#'
#' @return
#' If \code{y} is not provided, returns a weighted one-way frequency table
#' of \code{x}. Otherwise, returns a weighted two-way frequency table of
#' \code{x} and \code{y}
#' @seealso
#' \code{\link[Hmisc]{wtd.table}}, \command{\link{table}}, and the \code{survey} package.
#' @examples
#' data(hdv2003)
#' wtd.table(hdv2003$sexe, weights = hdv2003$poids)
#' wtd.table(hdv2003$sexe, weights = hdv2003$poids, normwt = TRUE)
#' table(hdv2003$sexe, hdv2003$hard.rock)
#' wtd.table(hdv2003$sexe, hdv2003$hard.rock, weights = hdv2003$poids)
#' @export



`wtd.table` <-
  function(x, y = NULL, weights = NULL, digits = 3, normwt = FALSE, useNA = c("no", "ifany", "always"), na.rm = TRUE, na.show = FALSE, exclude = NULL) {
    # Weighted one-way (x) or two-way (x by y) frequency table: each cell is
    # the sum of the weights of the observations falling in it.
    if (is.null(weights)) {
      warning("no weights argument given, using uniform weights of 1")
      weights <- rep(1, length(x))
    }
    if (length(x) != length(weights)) stop("x and weights lengths must be the same")
    if (!is.null(y) && (length(x) != length(y))) stop("x and y lengths must be the same")
    # Remember whether the caller supplied useNA before match.arg() resolves it.
    miss.usena <- missing(useNA)
    useNA <- match.arg(useNA)
    # Missing weights count as zero weight.
    weights[is.na(weights)] <- 0
    if (normwt) {
      # Rescale so the weighted total equals the number of observations.
      weights <- weights * length(x) / sum(weights)
    }

    # The deprecated arguments only take effect when useNA is NOT supplied, so
    # warn only when the caller mixes them with an explicit useNA.
    if (!miss.usena && (!missing(na.show) || !missing(na.rm))) {
      warning("'na.rm' and 'na.show' are ignored when 'useNA' is provided.")
    }
    if (useNA != "no" || (na.show && miss.usena)) {
      if (match(NA, exclude, nomatch = 0L)) {
        warning("'exclude' containing NA and 'useNA' != \"no\"' are a bit contradicting")
      }
      # Make NA an explicit level so it gets its own row/column.
      x <- addNA(x)
      if (!is.null(y)) y <- addNA(y)
    }
    if (useNA == "no" || (na.rm && miss.usena)) {
      s <- !is.na(x) & !is.na(weights)
      if (!is.null(y)) s <- s & !is.na(y)
      x <- x[s, drop = FALSE]
      if (!is.null(y)) y <- y[s, drop = FALSE]
      weights <- weights[s]
    }
    if (!is.null(exclude)) {
      s <- !(x %in% exclude)
      if (!is.null(y)) s <- s & !(y %in% exclude)
      # Re-factoring drops the levels that no longer occur.
      x <- factor(x[s, drop = FALSE])
      if (!is.null(y)) y <- factor(y[s, drop = FALSE])
      weights <- weights[s]
    }
    if (is.null(y)) {
      result <- tapply(weights, x, sum, simplify = TRUE)
    } else {
      result <- tapply(weights, list(x, y), sum, simplify = TRUE)
    }
    # tapply() returns NA for empty cells; report them as zero counts.
    result[is.na(result)] <- 0
    tab <- as.table(result)
    if (useNA == "ifany") {
      # Drop the NA row/column when it carries no weight. sum() over an empty
      # selection is 0, so a table without an NA level is left untouched, and
      # drop = FALSE keeps a two-way table two-dimensional when only one
      # row or column remains.
      if (!is.null(y)) {
        na.col <- is.na(colnames(tab))
        if (sum(tab[, na.col]) == 0) tab <- tab[, !na.col, drop = FALSE]
        na.row <- is.na(rownames(tab))
        if (sum(tab[na.row, ]) == 0) tab <- tab[!na.row, , drop = FALSE]
      } else {
        na.cell <- is.na(names(tab))
        if (sum(tab[na.cell]) == 0) tab <- tab[!na.cell]
      }
    }
    tab
  }

##' @export

wtd.var <- function(...) {
  # Removal stub: every call fails and points the caller to the Hmisc
  # implementation. Arguments are accepted but never used.
  removal_notice <- "questionr::wtd.var has been removed. Please use Hmisc::wtd.var instead."
  stop(removal_notice)
}

#' Weighted Crosstabs
#'
#' Generate table with multiple weighted crosstabs (full sample is first column).
#' kable(), which is found in library(knitr), is recommended for use with RMarkdown.
#'
#' @param df A data.frame that contains \code{x} and (optionally) \code{y} and \code{weight}.
#' @param x variable name (found in \code{df}). tabs(my.data, x = 'q1').
#' @param y one (or more) variable names. tabs(my.data, x = 'q1', y = c('sex', 'job')).
#' @param weight variable name for weight (found in \code{df}).
#' @param type 'percent' (default ranges 0-100), 'proportion', or 'counts' (type of table returned).
#' @param percent if \code{TRUE}, add a percent sign after the values when printing
#' @param normwt if TRUE, normalize weights so that the total weighted count is the same as the unweighted one
#' @param na.show if TRUE, show NA count in table output
#' @param na.rm if TRUE, remove NA values before computation
#' @param exclude values to remove from x and y. To exclude NA, use na.rm argument.
#' @param digits Number of digits to display; ?format.proptab for formatting details.
#' @details tabs calls wtd.table on `\code{x}` and, as applicable, each variable named by `\code{y}`.
#' @author Pete Mohanty
#' @examples
#' data(hdv2003)
#' tabs(hdv2003, x = "relig", y = c("qualif", "trav.imp"), weight = "poids")
#' result <- tabs(hdv2003, x = "relig", y = c("qualif", "trav.imp"), type = "counts")
#' format(result, digits = 3)
#' # library(knitr)
#' # xt <- tabs(hdv2003, x = "relig", y = c("qualif", "trav.imp"), weight = "poids")
#' # kable(format(xt))                        # to use with RMarkdown...
#'
#' @export

`tabs` <- function(df, x, y,
                   type = "percent", percent = FALSE,
                   weight = NULL, normwt = FALSE,
                   na.rm = TRUE, na.show = FALSE, exclude = NULL, digits = 1) {
  # Build a "proptab" matrix whose first column ("Overall") is the weighted
  # distribution of df[[x]] and whose remaining columns come from crossing
  # df[[x]] with each variable named in `y`.
  #   type     "counts" returns weighted counts; "proportion" and "percent"
  #            rescale every column to sum to 1 or 100 respectively
  #   percent  stored as the "percent" attribute (forced to FALSE unless
  #            type is "percent")
  #   digits   stored as the "digits" attribute
  # All other arguments are forwarded to wtd.table().
  if (!(type %in% c("percent", "proportion", "counts"))) {
    stop("type must either be 'percent', 'proportion', or 'counts'.")
  }

  if (!inherits(df, "data.frame")) {
    stop("df must be a data.frame")
  }

  if (!(x %in% names(df))) {
    stop(paste(x, "not found in data frame."))
  }
  # Report only the names that are actually missing, in a single message.
  unknown_y <- setdiff(y, names(df))
  if (length(unknown_y) > 0L) {
    stop(paste(paste(unknown_y, collapse = ", "), "not found in data frame."))
  }
  if (!is.null(weight) && !(weight %in% names(df))) {
    stop(paste(weight, "not found in data frame."))
  }

  w <- if (is.null(weight)) NULL else df[[weight]]
  as_share <- type %in% c("percent", "proportion")

  result <- wtd.table(df[[x]],
    y = NULL, weights = w,
    normwt = normwt, na.rm = na.rm, na.show = na.show, exclude = exclude
  )
  if (as_share) {
    result <- result / sum(result, na.rm = na.rm)
  }

  for (v in y) {
    tmp <- wtd.table(df[[x]], df[[v]],
      weights = w,
      normwt = normwt, na.rm = na.rm, na.show = na.show, exclude = exclude
    )
    if (as_share) {
      # Normalise each column separately so every crossed column is a
      # distribution of x comparable with "Overall"; dividing by the grand
      # total would make the columns of one y variable share a single 100%.
      # sweep() keeps the matrix shape even when x has a single level.
      tmp <- sweep(tmp, 2L, colSums(tmp, na.rm = na.rm), "/")
    }
    result <- cbind(result, tmp)
  }
  if (type == "percent") {
    result <- 100 * result
  }

  colnames(result)[1] <- "Overall"
  class(result) <- c("proptab", class(result))

  # A percent sign only makes sense for percent-scaled output.
  attr(result, "percent") <- if (type == "percent") percent else FALSE
  attr(result, "digits") <- digits

  result
}