R/proptest.R

#' Test of proportions with improved layout
#' 
#' Performs a one- or two-sample test of proportions using data. This test can be approximate or exact.
#' 
#' Missing values must be given by \code{"NA"}s to be recognized as missing values. 
#' Numeric data must be given in 0-1 form. 
#' This function also accepts binary factor variables, treating the higher level as 1 and the lower level 
#' as 0, or logical variables. 
#' 
#' @aliases proptest proptest.do plot.proptest print.proptest proptest.default
#' 
#' @param var1 a (non-empty) vector of binary numeric (0-1), binary factor, or logical
#' data values
#' @param var2 an optional (non-empty)
#' vector of binary numeric (0-1), binary factor, or logical
#' data values
#' @param by a variable of equal length to
#' that of \code{var1} with two outcomes (numeric or factor). This will be used to define strata
#' for a prop test on \code{var1}.
#' @param exact If true, performs a
#' test of equality of proportions using exact binomial probabilities.
#' @param null.hypoth a number specifying the
#' null hypothesis for the mean (or difference in means if performing a
#' two-sample test). Defaults to 0.5 for a one-sample test and 0 for a two-sample test.
#' @param alternative a string: one of
#' \code{"less"}, \code{"two.sided"}, or \code{"greater"} specifying the form
#' of the test. Defaults to a two-sided test. 
#' @param conf.level confidence level of the
#' test. Defaults to 0.95.
#' @param correct a logical indicating whether to perform a continuity correction
#' @param more.digits a numeric value
#' specifying whether or not to display more or fewer digits in the output.
#' Non-integers are automatically rounded down. 
#' 
#' @return A list of class \code{proptest}. The print method lays out the information in an easy-to-read
#' format. 
#' \item{tab}{A formatted table of descriptive and inferential results (total number of observations,
#' number of missing observations, sample proportion, standard error of the proportion estimate), 
#' along with a confidence interval for the underlying proportion.}
#' \item{zstat}{the value of the test
#' statistic, if using an approximate test.} 
#' \item{pval}{the p-value
#' for the test} 
#' \item{var1}{The user-supplied first data vector. }
#' \item{var2}{The user-supplied second data vector. }
#' \item{by}{The user-supplied stratification variable.}
#' \item{par}{A vector of information about the type of test (null hypothesis, alternative hypothesis, etc.)}

#' 
#' @seealso \code{\link[stats]{prop.test}}
#' @examples
#' 
#' # Read in data set
#' data(psa)
#' attach(psa)
#' 
#' # Define new binary variable as indicator
#' # of whether or not bss was worst possible
#' bssworst <- bss
#' bssworst[bss == 1] <- 0
#' bssworst[bss == 2] <- 0
#' bssworst[bss == 3] <- 1
#' 
#' 
#' # Perform test comparing proportion in remission
#' # between bss strata
#' proptest(factor(inrem), by = bssworst)
#' 
#' @export proptest
proptest <- function (var1, var2 = NULL, by = NULL, exact = FALSE, 
                    null.hypoth = ifelse(is.null(var2) && is.null(by), 0.5, 0), alternative = "two.sided", 
                    conf.level = 0.95, correct = FALSE, more.digits = 0) 
{
  proptest.do <- function(var1, var2 = NULL, by = NULL, 
                       exact = FALSE, null.hypoth = ifelse(is.null(var2) && is.null(by), 0.5, 0), alternative = "two.sided", 
                       conf.level = 0.95, correct = FALSE, more.digits = 0, 
                       myargs, ...) {
    # can only do var1 vs var2 or var1 by
    if (length(var2) > 0 & length(by) > 0) {
      stop("Please specify only one of the variables 'by' or 'var2'")
    }
    # check that alternative is one of "less", "two.sided", or "greater"
    if (!(alternative %in% c("less", "two.sided", "greater"))) {
      stop("'alternative' must be either 'less', 'two.sided', or 'greater'")
    }
    if (!(is.scalar(null.hypoth))){
      stop("Null must be a scalar")
    }
    if (!is.scalar(conf.level) || conf.level < 0 || conf.level > 1){
      stop("'conf.level' must a scalar between 0 and 1.")
    }
    if (!(is.numeric(var1) && isTRUE(all.equal(sort(unique(var1)), c(0,1)))) &&
        !(is.factor(var1[!is.na(var1)]) && length(sort(unique(var1[!is.na(var1)]))) == 2) &&
        !is.logical(var1)){
      stop("Only binary 0-1 data, two-level factors, and logicals are allowed.")
    }
    if (length(var2) > 0){
      if ((!(is.numeric(var2) && isTRUE(all.equal(sort(unique(var2)), c(0,1)))) &&
          !(is.factor(var2[!is.na(var2)]) && length(sort(unique(var2[!is.na(var2)]))) == 2) &&
          !is.logical(var2)) ||
        !isTRUE(all.equal(as.character(sort(unique(var1[!is.na(var1)]))), as.character(sort(unique(var2[!is.na(var2)])))))) {
        stop("Only binary 0-1 data, two-level factors, and logicals are allowed.")
      }
    }
    if (!is.logical(exact)){
      stop("'exact' must be a logical.")
    }
    if (!is.logical(correct)){
      stop("'correct' must be a logical.")
    }
    if ((length(var2) > 0 || length(by) > 0) && exact){
      stop("Exact binomial test not available for two samples.")  
    }
    if (null.hypoth != 0 && !is.null(var2)){
      stop("Two sample test only allows a null of 0.")
    }
    #levs <- NULL
    #levs1 <- NULL
    #levs2 <- NULL
    # convert factor to numeric
    if (is.factor(var1[!is.na(var1)])){
      levs1 <- levels(var1[!is.na(var1)])
      var1 <- as.numeric(var1) - 1
    }
    if (length(var2) > 0){
      if (is.factor(var2[!is.na(var2)])){
        levs2 <- levels(var2[!is.na(var2)])
        var2 <- as.numeric(var2) - 1
      }
    }
    #levs <- unique(c(levs1, levs2))

    # can't specify null for two-sample test
    # get length of stratification variable (byt)
    byt <- sort(unique(by),decreasing=FALSE)
    byt <- byt[!is.na(byt)]
    # check to make sure additional digit request is numeric
    if (!is.numeric(more.digits)) {
      stop("Argument 'more.digits' must be numeric")
    }
    # if by is a factor, turn it into a numeric
    if (is.factor(by)) {
      by <- as.numeric(by)
    }
    # something with more digits
    more.digits <- max(floor(more.digits), 0)
    # ensure by variable has length 2 (no more or less)
    if (!is.null(by[1]) && length(unique(by)) == 1) {
      stop("Variable 'by' only has one unique value")
    }
    
    if (length(by) > 0) {
      if (length(unique(by)) > 2 & sum(is.na(by)) == 0) {
        stop("Variable 'by' has more than two unique values.")
      }
      if (length(by) != length(var1)) {
        stop("Variable 'by' is not of equal length to data vector")
      }
      # stratify var1 by by variable
      var1<-var1[!is.na(by)]
      by<-by[!is.na(by)]
      by1 <- sort(unique(by), decreasing = TRUE)[1]
      by2 <- sort(unique(by), decreasing = TRUE)[2]
      var2 <- var1[by == by1]
      var1 <- var1[by == by2]
    }
    digits <- 3 + more.digits
    
    cl <- (1 + conf.level)/2
    # Case where by is not entered
    if (length(by) == 0 & is.null(by[1])) {
      # var2 is not entered
      if (length(var2) == 0 & is.null(var2)) {
        ns <- c(length(var1))
        msg <- sum(is.na(var1))
        var1 <- var1[!is.na(var1)]
        # here we do one sample prop test with var1 
        est1 <- mean(var1)
        se <- NULL
        cil <- NULL
        cih <- NULL
        zstat <- NULL
        pval <- NULL
        est1 <- as.numeric(format(est1), digits = digits)
        
        if (exact) {
          binom <- stats::binom.test(x = sum(var1), n = length(var1), p = null.hypoth, 
                              alternative = alternative, conf.level = conf.level)
          se <- sqrt(est1*(1-est1)/length(var1))
          cil <- as.numeric(format(min(binom$conf.int), 
                                   digits = digits))
          cih <- as.numeric(format(max(binom$conf.int), 
                                   digits = digits))
          se <- as.numeric(format(se, digits = digits))
          pval <- as.numeric(format(binom$p.value, 
                                    digits = digits))
        } else {
          chisq <- stats::prop.test(x = sum(var1), n = length(var1), conf.level = conf.level, p = null.hypoth,
                             alternative = alternative, correct = correct)
          zstat <- as.numeric(format(sign(est1 - null.hypoth)*sqrt(chisq$statistic), 
                                         digits = digits))
          pval <- as.numeric(format(chisq$p.value, 
                                      digits = digits))
          se <- sqrt(est1*(1-est1)/length(var1))
          cil <- est1 - stats::qnorm(cl) * se
          cih <- est1 + stats::qnorm(cl) * se
          cil <- as.numeric(format(cil, digits = digits))
          cih <- as.numeric(format(cih, digits = digits))
          se <- as.numeric(format(se, digits = digits))
        }
        main <- matrix(c(myargs[1], ns[1], msg[1], 
                          est1, se, paste("[", cil, ", ", cih, 
                                                 "]", sep = "")), ncol = 6)
        main <- data.frame(main)
        names(main) <- c("Variable", "Obs", "Missing", 
                          "Estimate", "Std. Err.", paste(conf.level * 100, "% CI", sep = ""))
        row.names(main) <- c("")
      }
      # two-sample, not using by 
      if (length(var2) > 0 && is.null(by)) {
        ns <- c(length(var1), length(var2), length(var1) + length(var2))
        msg <- c(sum(is.na(var1)), sum(is.na(var2)), sum(is.na(var1)) + sum(is.na(var2)))
        var1 <- var1[!is.na(var1)]
        var2 <- var2[!is.na(var2)]
        est1 <- mean(var1)
        est2 <- mean(var2)
        se <- NULL
        cil <- NULL
        cih <- NULL
        zstat <- NULL
        pval <- NULL
        est1 <- as.numeric(format(est1), digits = digits)
        est2 <- as.numeric(format(est2), digits = digits)
        est_diff <- est1 - est2
        # no two-sample exact test
        se <- c(sqrt(est1 * (1-est1)/length(var1)),
                sqrt(est2 * (1-est2)/length(var2)),
                sqrt((est1 * (1-est1))/length(var1) + (est2*(1-est2))/length(var2)))
        cil <- c(est1 - stats::qnorm(cl)*se[1],
                 est2 - stats::qnorm(cl)*se[2],
                 est1 - est2 - stats::qnorm(cl) * se[3])
        cih <- c(est1 + stats::qnorm(cl)*se[1],
                 est2 + stats::qnorm(cl)*se[2],
                 est1 - est2 + stats::qnorm(cl) * se[3])
        chisq <- stats::prop.test(x = c(sum(var1), sum(var2)),
                                  n = c(length(var1), length(var2)),
                                  conf.level = conf.level, 
                                  alternative = alternative, 
                                  correct = correct)
        zstat <- as.numeric(format(sign(est_diff)*sqrt(chisq$statistic), 
                                   digits = digits))
        pval <- as.numeric(format(chisq$p.value, 
                                  digits = digits))
        cil <- as.numeric(format(cil, digits = digits))
        cih <- as.numeric(format(cih, digits = digits))
        se <- as.numeric(format(se, digits = digits))
        est_diff <- as.numeric(format(est_diff), digits = digits)
        main <- matrix(c(myargs[1], ns[1], msg[1], est1, 
                           se[1], paste("[", cil[1], ", ", cih[1], "]", sep = ""), 
                         myargs[2], ns[2], msg[2], 
                           est2, se[2],  paste("[", cil[2], ", ", cih[2], "]", sep = ""), 
                         "Difference", ns[3], msg[3], 
                            est_diff, se[3], paste("[", cil[3], ", ", cih[3], "]", sep = "")),
                       ncol = 6, byrow = TRUE)
        main <- data.frame(main)
        names(main) <- c("Group", "Obs", "Missing", 
                          "Mean", "Std. Err.", paste(conf.level * 100, "% CI", sep = ""))
        row.names(main) <- c("", " ", "  ")
      }
    }
    # two-sample, using by
    if (length(by) > 0) {
      ns <- c(length(var1), length(var2), length(var1) + length(var2))
      msg <- c(sum(is.na(var1)), sum(is.na(var2)), sum(is.na(var1)) + sum(is.na(var2)))
      var1 <- var1[!is.na(var1)]
      var2 <- var2[!is.na(var2)]
      est1 <- mean(var1)
      est2 <- mean(var2)
      se <- NULL
      cil <- NULL
      cih <- NULL
      zstat <- NULL
      pval <- NULL
      est1 <- as.numeric(format(est1), digits = digits)
      est2 <- as.numeric(format(est2), digits = digits)
      est_diff <- est1 - est2
      est_diff <- as.numeric(format(est_diff), digits = digits)
      se <- c(sqrt(est1 * (1-est1)/length(var1)),
              sqrt(est2 * (1-est2)/length(var2)),
              sqrt((est1 * (1-est1))/length(var1) + (est2*(1-est2))/length(var2)))
      cil <- c(est1 - stats::qnorm(cl)*se[1],
               est2 - stats::qnorm(cl)*se[2],
               est1 - est2 - stats::qnorm(cl) * se[3])
      cih <- c(est1 + stats::qnorm(cl)*se[1],
               est2 + stats::qnorm(cl)*se[2],
               est1 - est2 + stats::qnorm(cl) * se[3])
      chisq <- stats::prop.test(x = c(sum(var1), sum(var2)),
                                n = c(length(var1), length(var2)),
                                conf.level = conf.level, alternative = alternative, correct = correct)
      zstat <- as.numeric(format(sign(est_diff)*sqrt(chisq$statistic), 
                                 digits = digits))
      pval <- as.numeric(format(chisq$p.value, 
                                digits = digits))
      cil <- as.numeric(format(cil, digits = digits))
      cih <- as.numeric(format(cih, digits = digits))
      se <- as.numeric(format(se, digits = digits))
      
      main <- matrix(c(paste(myargs[3], " = ", byt[1], 
                              sep = ""), ns[1], msg[1], est1, se[1], 
                        paste("[", cil[1], ", ", cih[1], 
                                        "]", sep = ""), 
                       paste(myargs[3], " = ", byt[2], sep = ""), ns[2], msg[2], est2, 
                        se[2], paste("[", cil[2], ", ", 
                                                 cih[2], "]", sep = ""), 
                      "Difference", 
                        ns[3],
                        msg[3],
                        est_diff,
                        se[3],
                        paste("[", cil[3], ", ", cih[3], "]", sep = "")), 
                      ncol = 6, byrow = TRUE)
      main <- data.frame(main)
      names(main) <- c("Group", "Obs", "Missing", 
                        "Mean", "Std. Err.", paste(conf.level * 
                                                                  100, "% CI", sep = ""))
      row.names(main) <- c("", " ", "  ")
      
    }
    par <- c(null.hypoth = null.hypoth, alternative = alternative, 
             conf.level = conf.level, exact = exact, correct = correct, digits = digits)
    invisible(list(tab = main, zstat = zstat, pval = pval, 
                   var1 = var1, var2 = var2, by = by, par = par))
  }
    
    
  myargs <- c(deparse(substitute(var1)), deparse(substitute(var2)), 
              deparse(substitute(by)))
  proptest.obj <- proptest.do(var1 = var1, var2 = var2, 
                              by = by, exact = exact,
                              null.hypoth = null.hypoth, 
                              alternative = alternative, 
                              conf.level = conf.level,
                              more.digits = more.digits, 
                              correct = correct,
                              myargs = myargs)
  
  proptest.obj$call <- match.call()
  class(proptest.obj) <- "proptest"
  return(proptest.obj)
}

Try the rigr package in your browser

Any scripts or data that you put into this service are public.

rigr documentation built on Sept. 7, 2022, 1:05 a.m.