R/rss.sampling.R
In generalRSS: Statistical Tools for Balanced and Unbalanced Ranked Set Sampling

Documented in rss.sampling

#' @details This function performs balanced or unbalanced ranked set sampling from a given data set. The length of the sample allocation vector (nsamp) must match the set size (H). The subject ID, outcome variable (Y), and auxiliary variable (X) must have the same length. If Y is not provided (Y=NULL), the function returns only the sampled IDs and their ranks without generating Y values.
#' @title Generate ranked set samples
#' @name rss.sampling
#' @description The rss.sampling function generates ranked set samples by performing ranked set sampling directly on a given population data set using an auxiliary variable (X) and subject IDs. The outcome variable (Y) is optional. If Y=NULL, the function returns only the sampled IDs and ranks.
#'
#' @param ID A numeric vector of subject IDs from the population. IDs must be unique.
#' @param Y A numeric vector of interested outcome variable. If Y=NULL (default), only IDs and ranks are returned.
#' @param X A numeric vector of auxiliary variable used for ranking. Must have the same length as ID.
#' @param H The RSS set size
#' @param nsamp A numeric vector specifying the sample allocation for each stratum.
#'
#' @return A data frame with the following columns:
#' \item{ID}{The sampled subjects' IDs.}
#' \item{rank}{The rank information assigned to each sample.}
#' \item{y}{The generated ranked set samples of the outcome variable Y. If Y=NULL, this column is not included.}
#' @examples
#' ## Example 1: Balanced RSS with equal sample sizes.
#' data(iris)
#' id=1:nrow(iris)
#' rss.data=rss.sampling(ID=id, Y=iris$Sepal.Length, X=iris$Petal.Length, H=3,nsamp=c(6,6,6))
#'
#' ## Example 2: Unbalanced RSS with different sample sizes.
#' rss.data=rss.sampling(ID=id, Y=iris$Sepal.Length, X=iris$Petal.Length, H=3, nsamp=c(6,10,8))
#'
#' # Check the structure of the RSS data
#' colnames(rss.data) # include "ID", "rank", and "Y"
#' head(rss.data$ID)
#' head(rss.data$rank)
#'
#' ## Example 3: If Y is not available, retrieve sampled IDs and ranks only.
#' rss.data=rss.sampling(ID=id, X=iris$Petal.Length, H=3,nsamp=c(6,10,8))
#'
#' # Check the structure of the RSS data
#' colnames(rss.data) # include "ID" and "rank"
#' head(rss.data$ID)
#' head(rss.data$rank)
#' head(rss.data$y)
#'
#' @export
rss.sampling <- function(ID, Y=NULL, X, H, nsamp)
{
  if (any(duplicated(ID))) stop("ID must be unique.", call. = FALSE)
  if(length(ID)!= length(X)) stop("ID and X must have the same length.", call.=F)
  if (!is.null(Y) && length(ID) != length(Y)) stop("ID and Y must have the same length if Y is provided.", call. = FALSE)

  n=sum(nsamp)
  if(is.null(Y)){
    p=2
  }else{p=3}
  data=matrix(0,n,p)

  if(H != length(nsamp)) stop("Set size are different with the length of sample allocations", call. = F)

  idx=sample(ID,H*n,replace=F)
  ind=NULL
  for(i in 1:length(idx)){
      ind=c(ind, which(idx[i]==ID))
  }

  if(!is.null(Y)){
    Y.sel <- matrix(Y[ind],ncol=H)
  }
  X.sel <- matrix(X[ind],ncol=H)
  ID.sel <- matrix(ID[ind],ncol=H)

  for(h in (1:H)){
    if(nsamp[h]!=0){
      for(i in (1:nsamp[h])){
        k=i
        if(h>1){
          k=i+sum(nsamp[1:(h-1)])
        }
        rdata=rank(X.sel[k,],ties.method='first')

        data[k,1]=h
        data[k,2]=ID.sel[k,rdata==h]
        if(p==3){
          data[k,3]=Y.sel[k,rdata==h]
        }
      }
    }
  }

  if(!is.null(Y)){
    colnames(data) <- c("rank","ID","y")
  }else{
    colnames(data) <- c("rank","ID")
  }
  data<-as.data.frame(data)
  return(data)
}