R/Prop_test.R
In lessR: Less Code, More Results

Prop_test <-
function (variable=NULL, success=NULL, by=NULL, data=d,
          n_succ=NULL, n_fail=NULL, n_tot=NULL, n_table=NULL,
          Yates=FALSE, pi=NULL, digits_d=3, ...) {

  # a dot in a parameter name to an underscore
  dots <- list(...)
  if (!is.null(dots)) if (length(dots) > 0) {
    for (i in 1:length(dots)) {
      if (names(dots)[i] == "p0") pi <- dots[[i]]
    }
  }

  # length(x) is count_n(x) + count_NA(x)
  count_n <- function(x) sum(!is.na(x))
  count_NA <- function(x) sum(is.na(x))

  # classify analysis by (a) if data or freqs and (b) by type
  # ---------------------------------------------------------
  # data or not
  do_data <- is.null(n_succ) && is.null(n_tot) && is.null(n_table) 

  # type of analysis
  p1 <- FALSE  # analyze a single proportion
  pm <- FALSE  # compare proportions over groups
  gf <- FALSE  # goodness-of-fit
  ct <- FALSE  # cross-tabs independence
  # ---------------------------------------------------------

  # option to provide raw data from which to compute proportion information
  # two potential variables: variable and by
  # variable is required if entering data
  if (do_data) {
    data.vars <- as.list(seq_along(data))
    names(data.vars) <- names(data)
    ind.var <- eval(substitute(variable), envir=data.vars, parent.frame())
    nm.var <- names(data)[ind.var]
    if (!missing(by)) {
      null_by <- FALSE    
      ind.grp <- eval(substitute(by), envir=data.vars, parent.frame())
      nm.grp <- names(data)[ind.grp]
    }
    else
      null_by <- TRUE

    # discover the type of do_data analysis
    if (!is.null(success) && null_by) p1 <- TRUE
    if (!is.null(success) && !null_by) pm <- TRUE
    if (is.null(success) && null_by) gf <- TRUE
    if (!is.null(n_table)) ct <- TRUE
    if (is.null(success) && !null_by) ct <- TRUE
  }  # end do_data

  # provide proportion information directly
  else {
    if (!is.null(n_succ) && (is.null(n_tot) && is.null(n_fail)))  {
      cat("\n"); stop(call.=FALSE, "\n","------\n",
        "When providing the number of successes, need the\n",
        "  corresponding number of trials, n_tot, or failures, n_fail.\n\n")
    }
    miss.n_table <- ifelse (is.null(n_table), TRUE, FALSE)

    if (!is.null(n_fail)) n_tot <- n_succ + n_fail
    if (length(n_succ) == 1 && !is.null(n_tot)) p1 <- TRUE
    if (length(n_succ) > 1 && !is.null(n_tot)) pm <- TRUE
    if (is.null(n_succ) && miss.n_table) gf <- TRUE
    if (!is.null(n_table)) ct <- TRUE
  }


  # --------------
  # one proportion
  if (p1) {  

    if (do_data) {
      n_NA <- count_NA(data[,ind.var])
      n_tot <- count_n(data[,ind.var])
      n_succ <- length(data[data[,ind.var]==success, ind.var])
    }

    if (missing(pi)) pi <- 0.5
    out <- binom.test(n_succ, n_tot, p=pi, ...)
    p.sample <- .fmt(out$estimate, digits_d)
    alt <- out$alternative

    cat("\n")
    cat(paste("<<<", out$method, "of a proportion", "\n\n"))
    if (do_data) {
      cat("variable:", nm.var, "\n")
      cat("success:", success, "\n")
      cat("\n")
    }
    cat("------ Describe ------\n\n")
    if (do_data) cat("Number of missing values:", n_NA, "\n")
    cat("Number of successes:", out$statistic, "\n")
    cat("Number of failures:", n_tot - n_succ, "\n")
    cat("Number of trials:", out$parameter, "\n")
    cat("Sample proportion:", p.sample, "\n")
    cat("\n")

    cat("------ Infer ------\n\n")
    if (alt != "two.sided") 
      cat("Alternative hypothesis: Population proportion is", alt, "than",
          pi, "\n")
    cat("Hypothesis test for null of ", pi, ", p-value: ",
        .fmt(out$p.value, digits_d), sep="", "\n")
    cat("95% Confidence interval:", .fmt(out$conf.int[1], digits_d), "to",
        .fmt(out$conf.int[2], digits_d), "\n")

    return(invisible(out))
  }  # end p1


  # --------------------
  # multiple proportions
  if (pm) {

    if (do_data) {
      # if more than 2 levels, reduce to 2 levels
      if (length(unique(data[,ind.var])) > 2) {  
        for (i in 1:nrow(data))
          data[i,ind.var] <- ifelse (data[i,ind.var]==success, success, "fail")
      }

      # two-column table, Successes and Failures
      tbl <- table(data[,ind.grp], data[,ind.var]) 

      # if needed, re-order the two tbl columns, success in 1st column
      ind <- which(colnames(tbl) == success)
      if (ind != 1) {
        tbln <- tbl
        tbln[,1] = tbl[,2,drop=FALSE]
        tbln[,2] = tbl[,1,drop=FALSE]
        colnames(tbln)[1] = colnames(tbl)[2]
        colnames(tbln)[2] = colnames(tbl)[1]
        tbl <- tbln
      }
    }  # end do data

    else {  # construct the table from given proportion info
      n_fail <- n_tot - n_succ
      m <- matrix(c(n_succ, n_fail), byrow=FALSE, ncol=2)
      if (is.null(names(n_succ)))
        rn <-  1:nrow(m)
      else
        rn <- names(n_succ)
      dimnames(m) <- list(rn, c("Success", "Fail"))
      tbl <- as.table(m)
    }

    # do the analysis
    out <- prop.test(tbl, correct=Yates, ...)
    rn <- rownames(tbl)
    for (i in 1:length(rn))
      names(out$estimate)[i] <- rownames(tbl)[i]

    cat("\n")
    cat(paste("<<<", out$method, "\n\n"))
    if (do_data) {
      cat("variable:", nm.var, "\n")
      cat("success:", success, "\n")
      cat("by:", nm.grp, "\n")
    }

    cat("\n")
    cat("--- Description")
    ns <- .fmt(tbl[,1], 0)
    nt <- .fmt(rowSums(tbl), 0)
    vals <- t(.fmt(out$estimate, digits_d))
    vals <- as.vector(vals)
    m <- matrix(c(ns, nt, vals), byrow=TRUE, ncol=length(ns))
    k <- data.frame(m)
    names(k) <- names(out$estimate)
    nm1 <- paste("n_", success, sep="")
    row.names(k) <- c(nm1, "n_total", "proportion")
    k <- kable(k, format="pandoc", align="r")
    print(k)

    cat("\n")
    cat("--- Inference\n\n")
    cat("Chi-square statistic:", .fmt(out$statistic, digits_d), "\n")
    cat("Degrees of freedom:", .fmt(out$parameter, 0), "\n")
    cat("Hypothesis test of equal population proportions: p-value = ",
        .fmt(out$p.value,digits_d), sep="", "\n")

    return(invisible(out))
  }


  # ---------------
  # goodness-of-fit
  if (gf) {

    if (do_data) {
      tbl <- table(data[,ind.var])
      nms <- names(tbl)
    }
    else {
      tbl <- n_tot
      if (is.null(names(n_tot)))
        nms <-  1:length(n_tot)
      else
        nms <- names(n_tot)
    }

    # do the analysis
    out <- chisq.test(tbl, ...)

    obs <- .fmt(out$observed, 0)
    exp <- .fmt(out$expected, digits_d)
    res <- .fmt(out$residuals, digits_d)
    std <- .fmt(out$stdres, digits_d)

    cat("\n")
    cat(paste("<<<", out$method, "\n\n"))
    if (do_data)
      cat("variable:", nm.var, "\n")

    cat("\n")
    cat("--- Description")
    m <- matrix(c(obs,exp,res,std), byrow=TRUE, ncol=length(tbl))
    k <- data.frame(m)
    names(k) <- nms
    row.names(k) <- c("observed", "expected", "residual", "stdn res")
    k <- kable(k, format="pandoc", align="r")
    print(k)

    cat("\n")
    cat("--- Inference\n\n")
    cat("Chi-square statistic:", .fmt(out$statistic, digits_d), "\n")
    cat("Degrees of freedom:", .fmt(out$parameter, 0), "\n")
    cat("Hypothesis test of equal population proportions: p-value = ",
        .fmt(out$p.value,digits_d), sep="", "\n")

    return(invisible(out))
  }  # end gf


  # -----------------------
  # cross-tabs independence
  if (ct) {

    if (do_data) {
      tbl <- table(data[,ind.grp], data[,ind.var])
      names(attributes(tbl)$dimnames) <- c(nm.grp, nm.var)

      cat("variable:", nm.var, "\n")
      cat("by:", nm.grp, "\n")
    }  # end do_data

    else {  # freq input
      # read the cross-tab table
      if (nchar(n_table) == 0) {
        n_table <- file.choose()
        cat("File: ", n_table, "\n\n")
      }
      if (grepl(".csv$", n_table) || grepl(".txt$", n_table)) 
        tbl <- read.csv(n_table, header=FALSE)
      else if (grepl(".xlsx$", n_table)) 
        tbl <- openxlsx::read.xlsx(n_table)
      else if (grepl("FreqTable99$", n_table)) { 
        f.nm <- "dataFreqTable99.rda"
        path.name <- paste(find.package("lessR"), "/data/",  f.nm, sep="")
        x.env <- new.env()  # scratch environment
        load(path.name, envir=x.env)
        dname <- "dataFreqTable99"
        tbl <- get(dname, pos=x.env)
      }
      else {
      cat("\n"); stop(call.=FALSE, "\n","------\n",
          "Frequency table file must be of file type\n",
          "  .csv for a Comma Separated Value text file or\n",
          "  .xlsx for an Excel file.\n\n")
      }
      nms <- 1:ncol(tbl)
      suppressWarnings(names(tbl) <- rep("", ncol(tbl)))
    }  # end freq input

    # analysis
    out <- chisq.test(tbl, correct=Yates, ...)   

    n.cases <- sum(rowSums(tbl))
    min_rc <- min(nrow(tbl)-1, ncol(tbl)-1)
    V <- sqrt(out$statistic / (min_rc * n.cases))
    txt <- ifelse(out$parameter == 1, " (phi)", "") 
    txt <- paste("Cramer\'s V", txt, ":", sep="")

    out <- chisq.test(tbl, correct=Yates, ...)
    obs <- .fmt(out$observed, 0)
    exp <- .fmt(out$expected, digits_d)
    res <- .fmt(out$residuals, digits_d)
    std <- .fmt(out$stdres, digits_d)

    k <- data.frame(row.names=1:(nrow(tbl)*ncol(tbl)))
    k[,1] <- rep(1:nrow(tbl), ncol(tbl))
    k[,2] <- rep(1:ncol(tbl), each=nrow(tbl))
    k[,3] <- obs; k[,4] <- exp;  k[,5] <- res;  k[,6] <- std
    names(k) <- c("Row", "Col", "Observed", "Expected", "Residual", "Stnd Res")
    k$Residual <- as.numeric(k$Observed) - as.numeric(k$Expected)
    k <- k[order(k[,1]),]  # display by row

    cat("\n")
    cat(paste("<<<", out$method, "\n\n"))

    cat("--- Description\n\n")
    if (do_data)
      print(addmargins(tbl))
    else {
      cat("Cell Frequencies")
      print(tbl, row.names=FALSE)
    }
    cat("\n", txt, .fmt(V,3), "\n\n")
    print(k, row.names=FALSE)

    cat("\n")
    cat("--- Inference\n\n")
    cat("Chi-square statistic:", .fmt(out$statistic, digits_d), "\n")
    cat("Degrees of freedom:", .fmt(out$parameter, 0), "\n")
    cat("Hypothesis test of equal population proportions: p-value = ",
        .fmt(out$p.value,digits_d), sep="", "\n")

    return(invisible(out))

  }  # end ct

}