# R/rss.t.test.R
# In generalRSS: Statistical Tools for Balanced and Unbalanced Ranked Set Sampling
#
# Documented in rss.t.test

#' @details This function performs a t-test on ranked set sample data for both one-sample and two-sample mean problems, using t approximations. For a one-sample test, provide `data1` as a data frame with `rank` and `y` columns. For a two-sample test, provide both `data1` and `data2` with equal set sizes. The `method` parameter allows for two options to approximate the t-distribution: "sample" and "naive" as introduced by Ahn et al. (2014). The function computes the t-statistic, confidence interval, degrees of freedom, and p-value based on the provided RSS data and specified parameters.
#' @title RSS t-test for one-sample and two-sample problems
#' @name rss.t.test
#' @description The rss.t.test function performs one- and two-sample t-tests on ranked set sample data using t approximations, with methods described by Ahn et al. (2014).
#'
#' @param data1 A numeric data frame of ranked set samples with columns `rank` for ranks and `y` for data values.
#' @param data2 An optional numeric data frame of ranked set samples with columns `rank` for ranks and `y` for data values.
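#' @param alpha A numeric value strictly between 0 and 1 specifying the significance level; the reported confidence interval has confidence level `1 - alpha` (default 0.05, i.e. a 95\% interval).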
#' @param alternative A character string specifying the alternative hypothesis. Must be one of "two.sided" (default), "greater", or "less".
#' @param mu0 A numeric value indicating the hypothesized value of the mean (for a one-sample problem) or the mean difference (for a two-sample problem).
#' @param method A character string specifying the method used to approximate the t-distribution. Must be either "sample" or "naive".
#'
#' @return
#'   \item{RSS_mean}{The RSS mean estimate (for a one-sample problem) or a vector of RSS mean estimates for each group (for a two-sample problem).}
#'   \item{CI}{The two-sided `1 - alpha` confidence interval, given as `c(lower, upper)`, for the population mean (for a one-sample problem) or for the mean difference (for a two-sample problem). It is two-sided regardless of `alternative`.}
#'   \item{t}{The t-statistic for the test.}
#'   \item{df}{The degrees of freedom for the test.}
#'   \item{p.value}{The p-value for the test.}
#'
#' @references
#' S. Ahn, J. Lim, and X. Wang. (2014) The student’s t approximation to distributions of pivotal statistics from ranked set samples. Journal of the Korean Statistical Society, 43, 643–652.
#'
#' @seealso
#' \code{\link{rss.simulation}}: used for simulating Ranked Set Samples (RSS), which can serve as input.
#'
#' \code{\link{rss.sampling}}: used for sampling Ranked Set Samples (RSS) from a population data set, providing input data.
#'
#' @importFrom methods is
#' @examples
#' ## Balanced RSS with a set size 3 and equal sample sizes of 6 for each stratum,
#' ## using imperfect ranking from a normal distribution with a mean of 0.
#' rss.data1=rss.simulation(H=3,nsamp=c(6,6,6),dist="normal", rho=0.8,delta=0)
#'
#' ## one-sample t-test using 'naive' method
#' rss.t.test(data1=rss.data1, data2=NULL, alpha=0.05,
#' alternative="two.sided", mu0=0, method="naive")
#'
#' ## one-sample t-test using 'sample' method
#' rss.t.test(data1=rss.data1, data2=NULL, alpha=0.05,
#' alternative="two.sided", mu0=0, method="sample")
#'
#' ## Unbalanced RSS with a set size 3 and different sample sizes of 6, 8, and 10 for each stratum,
#' ## using imperfect ranking from a normal distribution with a mean of 0.
#' rss.data2<-rss.simulation(H=3,nsamp=c(6,8,10),dist="normal", rho=0.8,delta=0)
#'
#' ## two-sample t-test using 'naive' method
#' rss.t.test(data1=rss.data1, data2=rss.data2, alpha=0.05,
#' alternative="two.sided", mu0=0, method="naive")
#'
#' ## two-sample t-test using 'sample' method
#' rss.t.test(data1=rss.data1, data2=rss.data2, alpha=0.05,
#' alternative="two.sided", mu0=0, method="sample")

#' @export
rss.t.test <- function(data1, data2=NULL, alpha=0.05, alternative="two.sided", mu0=0, method)
{
  # Performs a one-sample (data2 is NULL) or two-sample t-test on ranked set
  # sample data and returns a list with components RSS_mean, CI, t, df and
  # p.value. Both data frames must carry the columns `rank` and `y`.

  # ---- Argument validation -------------------------------------------------
  # Length checks keep the scalar `if` conditions below well defined.
  alternative.set <- c("two.sided", "less", "greater")
  if (length(alternative) != 1L || !alternative %in% alternative.set) {
    stop("Invalid alternative selected. Please choose from 'two.sided', 'less', or 'greater'.")
  }

  # `method` has no default, so a missing value is reported with the same
  # message as an unrecognised one.
  method.set <- c("sample", "naive")
  if (missing(method) || length(method) != 1L || !method %in% method.set) {
    stop("Invalid method selected. Please choose from 'sample', 'naive'.")
  }

  # alpha must be a single number strictly inside (0, 1).
  if (!is.numeric(alpha) || length(alpha) != 1L || is.na(alpha) ||
      alpha <= 0 || alpha >= 1) {
    stop("alpha is out of bound.", call. = FALSE)
  }

  # ---- Local helpers -------------------------------------------------------
  # TRUE when `data` carries the two columns every computation below reads.
  has.rss.columns <- function(data) {
    all(c("rank", "y") %in% colnames(data))
  }

  # Shared tail of both branches: t statistic against mu0, two-sided
  # (1 - alpha) confidence interval around `estimate`, and the p-value for the
  # requested alternative, all using a t distribution with `nu` degrees of
  # freedom. `lower.tail = FALSE` is used instead of `1 - pt()` so that very
  # small upper-tail probabilities are not lost to cancellation.
  summarise.test <- function(rss.mean, estimate, se, nu) {
    tstat <- (estimate - mu0) / se
    half.width <- stats::qt(1 - alpha / 2, nu) * se
    pval <- switch(alternative,
      two.sided = 2 * stats::pt(abs(tstat), nu, lower.tail = FALSE),
      less      = stats::pt(tstat, nu),
      greater   = stats::pt(tstat, nu, lower.tail = FALSE)
    )
    list(RSS_mean = rss.mean,
         CI = c(estimate - half.width, estimate + half.width),
         t = tstat, df = nu, p.value = pval)
  }

  # ---- One-sample problem --------------------------------------------------
  if (is.null(data2)) {
    if (!has.rss.columns(data1)) {
      stop("The input data must contain 'rank' and 'y' variables.")
    }

    H <- length(unique(data1$rank))   # number of distinct ranks (set size)
    nsamp <- table(data1$rank)        # observations per rank stratum

    # RSS mean: unweighted average of the per-rank means.
    rss.mu <- mean(tapply(data1$y, data1$rank, mean))
    varh <- tapply(data1$y, data1$rank, stats::var)
    rss.sd <- sqrt(sum(varh / nsamp) / H^2)

    nu <- if (method == "sample") {
      # Degrees of freedom estimated from the per-rank sample variances.
      sum(varh / nsamp)^2 / sum(varh^2 / nsamp^2 / (nsamp - 1))
    } else {
      # "naive": total sample size minus the number of rank strata.
      sum(nsamp) - H
    }

    return(summarise.test(rss.mu, rss.mu, rss.sd, nu))
  }

  # ---- Two-sample problem --------------------------------------------------
  if (!has.rss.columns(data1)) {
    stop("The first input data must contain 'rank' and 'y' variables.")
  }
  if (!has.rss.columns(data2)) {
    stop("The second input data must contain 'rank' and 'y' variables.")
  }

  H1 <- length(unique(data1$rank))
  H2 <- length(unique(data2$rank))
  nsamp1 <- table(data1$rank)
  nsamp2 <- table(data2$rank)
  n1 <- sum(nsamp1)
  n2 <- sum(nsamp2)

  if (H1 != H2) stop("Set sizes are different.", call. = FALSE)
  H <- H1

  rss.mu1 <- mean(tapply(data1$y, data1$rank, mean))
  rss.mu2 <- mean(tapply(data2$y, data2$rank, mean))
  rss.diff <- rss.mu1 - rss.mu2

  varh1 <- tapply(data1$y, data1$rank, stats::var)
  varh2 <- tapply(data2$y, data2$rank, stats::var)

  # Per-rank pooled variance and standard error of the mean difference.
  # NOTE(review): the weights and the (1/n1 + 1/n2) factor use the TOTAL
  # sample sizes n1/n2, whereas the one-sample branch and the degrees-of-
  # freedom denominator below use per-rank sizes (nsamp1/nsamp2). Kept as in
  # the original implementation -- confirm against Ahn et al. (2014).
  pvarh <- ((n1 - 1) * varh1 + (n2 - 1) * varh2) / (n1 + n2 - 2)
  scaled.var <- (1 / n1 + 1 / n2) * pvarh
  rss.sd <- sqrt(sum(scaled.var)) / H

  nu <- if (method == "sample") {
    sum(scaled.var)^2 / sum(scaled.var^2 / (nsamp1 + nsamp2 - 2))
  } else {
    # "naive": combined sample size minus the rank strata of both samples.
    n1 + n2 - 2 * H
  }

  summarise.test(c(rss.mu1, rss.mu2), rss.diff, rss.sd, nu)
}