R/kappa.R


TOL <- 1.5e-8

#' Cohen's kappa for nominal data
#'
#' Cohen's kappa is the classical agreement measure when two raters provide
#' ratings for subjects on a nominal scale.
#'
#' The ratings must be stored in a two-column object: each rater is a
#' column and the subjects are in the rows. Every rating category is used and
#' the levels are sorted. Weighting of categories is currently not implemented.
#' 
#' @examples
#' # 2 raters have assessed 4 subjects into categories "A", "B" or "C"
#' # organize ratings as two column matrix, one row per subject rated
#' m <- rbind(sj1 = c("A", "A"),
#'            sj2 = c("C", "B"),
#'            sj3 = c("B", "C"),
#'            sj4 = c("C", "C"))
#'            
#' # Cohen's kappa -----
#' kappa2(ratings = m)
#' 
#' # robust variant ---------
#' kappa2(ratings = m, robust = TRUE)
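#' 
#' # explicitly provided rating scale ---------
#' # (here with a hypothetical category "D" that neither rater ever used)
#' kappa2(ratings = m, ratingScale = c("A", "B", "C", "D"))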
#'
#' @param ratings matrix (dimension nx2), containing the ratings as subjects by
#'   raters
#' @param robust flag. Use robust estimate for random chance of agreement by
#'   Brennan-Prediger?
#' @param ratingScale Possible levels for the rating. Or `NULL`.
#' @returns list containing Cohen's kappa agreement measure (value) or `NULL` if
#'   no valid subjects
#' @seealso [irr::kappa2()]
#' @export
kappa2 <- function (ratings, robust = FALSE, ratingScale = NULL) {
  ratings <- as.matrix(ratings)
  # handle ratingScale first, before dropping incomplete observations
  if (is.null(ratingScale)) {
    # sort also drops NAs (due to na.last=NA)
    ratingScale <- sort(as.character(unique(c(ratings[,1L], ratings[,2L]))), na.last = NA)
  } else {
    if (anyDuplicated(ratingScale)) {
      stop("Duplicated entries in rating scale!", call. = FALSE)
    }#fi
    ratingScale <- as.character(ratingScale)
    
    if (!all(as.character(ratings[!is.na(ratings)]) %in% ratingScale)) {
      stop("Ratings ", paste0(unique(as.character(ratings)), collpase="*"),
           " do not match provided rating scale ", paste0(ratingScale, collapse="*"), call. = FALSE)
    }#fi
  }#esle
  
  nCat <- length(ratingScale)
  if (nCat <= 1L) {
    stop("Rating scale needs **at least two** levels!", call. = FALSE)
  }#fi
  
  # check assumptions
  if (NCOL(ratings) != 2L) {
    stop("Please provide **exactly two** raters!", call. = FALSE)
  }
  
  # remove subjects with NA ratings
  ratings <- stats::na.omit(ratings)
  
  nSj <- NROW(ratings) # nbr subjects
  if (nSj < 1) {
    return(NULL)
  }
  
  
  # cross-table of two raters
  rtab <- table(factor(ratings[, 1L], levels = ratingScale),
                factor(ratings[, 2L], levels = ratingScale))
  
  rtab.rs <- .rowSums(rtab, m = nCat, n = nCat)
  rtab.cs <- .colSums(rtab, m = nCat, n = nCat)
  
  # agreement as found on diagonal elements
  agreeP <- sum(diag(rtab)) / nSj 
  chanceP <- if (robust) {
    # uniform
    1/nCat
  } else {
    # marginal proportions
    crossprod(rtab.rs, rtab.cs)[1L] / nSj^2
  }
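  #+in formula terms (as computed above): marginal model p_e = sum_k p_k. * p_.k
  #+(product of row and column marginal proportions); uniform Brennan-Prediger model
  #+p_e = 1/q with q = nCat categories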
  
  # Cohen's kappa for 2 raters
  k2 <- agreeP - chanceP
  # normalization if numerator and denominator != 0
  #>agreeP = chanceP is always kappa=0 (no need to normalize)
  #>chanceP=1 => denominator 0
  if (abs(k2) > TOL && chanceP < 1) {
    k2 <- k2 / (1L - chanceP)
  }
  
  # standard error
  # cf Large Sample Standard Errors of Kappa ... (Fleiss, 1969)
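  #+as implemented below:
  #+var(kappa) = 1/(n*(1-p_e)^4) * [ sum_i p_ii * ((1-p_e) - (p_.i + p_i.)*(1-p_o))^2
  #+  + (1-p_o)^2 * sum_{i!=j} p_ij * (p_.i + p_j.)^2  -  (p_o*p_e - 2*p_e + p_o)^2 ]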
  pRS <- rtab.rs / nSj
  pCS <- rtab.cs / nSj
  ptab0 <- rtab / nSj; diag(ptab0) <- 0
  dsTerm <- 0
  for (i in seq_along(ratingScale)) {
    for (j in seq_along(ratingScale)) {
      dsTerm <- dsTerm + ptab0[i,j] * (pCS[[i]] + pRS[[j]])^2
    }#rof j
  }#rof i
  
  varK2 <- 1/(nSj*(1-chanceP)^4) * (
    crossprod(diag(rtab)/nSj, ((1-chanceP) - (pCS + pRS) * (1-agreeP))^2)[1L] +
      (1-agreeP)^2 * dsTerm - (agreeP*chanceP - 2 * chanceP + agreeP)^2)
  
  # return
  list(
    method = "Cohen's Kappa for two Raters",
    subjects = nSj, raters = 2, categories = nCat,
    robust = robust,
    agreem = agreeP,
    value = k2,
    #XXX currently, SE only implemented for standard Cohen's Kappa
    se = if (!robust && is.finite(varK2) && varK2 >= 0) sqrt(varK2) else NA_real_
  )
}


#' Fleiss' kappa for multiple nominal-scale raters
#' 
#' When multiple raters judge subjects on a nominal scale, their agreement can be assessed with Fleiss' kappa.
#' It is a generalization of Cohen's kappa for two raters, and there are different variants of how chance agreement is assessed.
#' 
#' Different **variants** of Fleiss' kappa are implemented.
#' By default (`variant="fleiss"`), the original Fleiss Kappa (1971) is calculated, together with an asymptotic standard error and test for kappa=0.
#' It does not assume that the raters for the different subjects are the same (one-way ANOVA setting).
#' The marginal category proportions determine the chance agreement.
#' Setting `variant="conger"` gives the variant of Conger (1980) that reduces to Cohen's kappa when m=2 raters. 
#' It assumes identical raters for the different subjects (two-way ANOVA setting).
#' The chance agreement is based on the category proportions of each rater separately.
#' Typically, the Conger variant yields slightly higher values than Fleiss kappa.
#' `variant="robust"` assumes a chance agreement of two raters to be simply 1/q, where q is the number of categories (uniform model).
#' 
#' @examples
#' # 4 subjects were rated by 3 raters in categories "1", "2" or "3"
#' # organize ratings as matrix with subjects in rows and raters in columns
#' m <- matrix(c("3", "2", "3",
#'               "2", "2", "1",
#'               "1", "3", "1",
#'               "2", "2", "3"), ncol = 3, byrow = TRUE)
#' kappam_fleiss(m)
#'
#' # show category-wise kappas -----
#' kappam_fleiss(m, detail = TRUE)
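#'
#' # Conger variant: chance agreement from each rater's own category proportions -----
#' kappam_fleiss(m, variant = "conger")
#'
#' # robust variant: uniform chance agreement of 1/q -----
#' kappam_fleiss(m, variant = "robust")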
#'
#' @param ratings matrix (subjects by raters), containing the ratings
#' @param variant Which variant of kappa? Default is Fleiss (1971). Other options are Conger (1980) or robust variant.
#' @param detail Should category-wise Kappas be computed? Only available for the Fleiss (1971) variant.
#' @param ratingScale Specify possible levels for the rating. Default `NULL` means to use all unique levels from the sample.
#' @returns list containing Fleiss' kappa agreement measure (value) or `NULL` if no subjects
#' @seealso [irr::kappam.fleiss()]
#' @export
kappam_fleiss <- function (ratings, variant = c("fleiss", "conger", "robust", "uniform"),
                           detail = FALSE, ratingScale = NULL) {
  variant <- match.arg(variant)
  
  ratings <- as.matrix(ratings)
  
  # handle ratingScale first, before dropping incomplete observations
  if (is.null(ratingScale)) {
    # sort also drops NAs (due to na.last=NA)
    ratingScale <- sort(unique(as.character(ratings)), na.last = NA)
  } else {
    if (anyDuplicated(ratingScale)) {
      stop("Duplicated entries in rating scale!", call. = FALSE)
    }
    ratingScale <- as.character(ratingScale)
    
    if (!all(as.character(ratings[!is.na(ratings)]) %in% ratingScale)) {
      stop("Ratings ", paste0(unique(as.character(ratings)), collpase="*"),
           " do not match provided rating scale ", paste0(ratingScale, collapse="*"), call. = FALSE)
    }
  }
  nCat <- length(ratingScale)
  if (nCat <= 1L) {
    stop("Rating scale needs **at least two** levels!", call. = FALSE)
  }
  
  # drop raters with only NA-ratings
  # using `drop = FALSE` important to stay matrix,
  #+even in edge cases with single col/row
  ratings <- ratings[, colSums(!is.na(ratings)) >= 1L, drop = FALSE]
  # drop subjects that are rated not at all or only once
  ratings <- ratings[rowSums(!is.na(ratings)) >= 2L, , drop = FALSE]
  
  nSj <- NROW(ratings)
  nr <- NCOL(ratings)
  
  
  if (nSj <= 0) {
    #warning("No subjects left!", call. = FALSE)
    return(NULL)
  }
  
  if (nr <= 0) {
    #warning("No raters left!", call. = FALSE)
    return(NULL)
  }
  
  method <- switch(variant, 
                   fleiss = "Fleiss' Kappa for m Raters",
                   conger = "Fleiss' Kappa for m Raters (Conger variant)",
                   uniform =,
                   robust = "Fleiss' Kappa for m Raters (robust variant)",
                   stop("Unknown variant of Fleiss kappa!", call. = FALSE)
  )
  
  
  # count nbr of rated categories per subject (nSj x nCat)
  tab_cnt_sj <- t(apply(ratings, 1,
                        FUN = function(ro) tabulate(factor(ro, levels = ratingScale), nbins = nCat)))
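  #+e.g. for the roxygen example above, subject 1 with ratings ("3", "2", "3")
  #+gives the count row c(0, 1, 2) over the rating scale ("1", "2", "3")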
  rt_cnt_sj <- .rowSums(tab_cnt_sj, m = nSj, n = nCat)
  
  # build return value
  rval <- list(method = method,
               subjects = nSj, raters = nr, categories = nCat,
               ratings = sum(tab_cnt_sj), balanced = isTRUE(stats::sd(rt_cnt_sj) == 0))
  
  # @param idx index vector of rows to use
  kappam_fleiss0 <- function(idx) {
    idxl <- length(idx)
    nRatings <- sum(tab_cnt_sj[idx,])
    cat_cnt <- .colSums(tab_cnt_sj[idx,], m = idxl, n = nCat)
    cat_ssq <- crossprod(cat_cnt)[1L]
    # prop of concordant pairs per subject:
    P_sj <- 1/(rt_cnt_sj[idx] - 1) * (.rowSums(tab_cnt_sj[idx,]^2, m = idxl, n = nCat)/rt_cnt_sj[idx] - 1)
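    #+equivalently: P_sj = sum_k n_jk*(n_jk - 1) / (r_j*(r_j - 1)), where n_jk is the number
    #+of ratings of category k for subject j and r_j = rt_cnt_sj[j] its total number of ratings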
    agreeP <- stats::weighted.mean(P_sj, w = rt_cnt_sj[idx])
    #XXX weighting? # or simply mean(P_sj)
    #+P_sj are proportions, based on denominator rt_cnt_sj * (rt_cnt_sj-1)
    #+
    #+balanced data used for agreeP:
    #sum((colSums(sj_cnt_tab^2) - nr) / (nr * (nr - 1) * nSj)) #or
    #mean((colSums(sj_cnt_tab^2) - nr)) / (nr * (nr - 1))
    
    chanceP <- switch(variant, 
                      fleiss = cat_ssq / nRatings^2,
                      conger = local({
                        # counts of rated categories per rater (nr x nCat)
                        tab_cnt_rt <- t(apply(ratings[idx,], 2,
                                              FUN = function(co) tabulate(factor(co, levels = ratingScale), nbins = nCat)))
                        tab_prop_rt <- proportions(tab_cnt_rt, margin = 1)
                        # Conger (1980), p. 325 divides the correction term by (nr-1) (not by nr)
                        #+but (nr-1) leads to different results when I compare with his derivation in a balanced example
                        #+Formula 1:
                        #cat_ssq / nRatings^2 - sum(apply(tab_prop_rt, 2, stats::var)) / nr
                        
                        # For Conger, chanceP is "average proportion of raters who agreed on the classification of each subject"
                        #+He compares all pairs of raters, which is well-founded in his balanced setting as each rater rates all subjects
                        #+For the unbalanced setting, it might be that two raters do not share any subject.
                        #+But that might not be a problem if we assume exchangeable subjects (and raters)
                        #+Formula 2:
                        #pagr_mat <- tcrossprod(tab_prop_rt)
                        #sum(pagr_mat[lower.tri(pagr_mat)]) / sum(lower.tri(pagr_mat))
                        
                        #+Formulas 1 & 2 agree for balanced setting but are slightly different in unbalanced settings
                        #+I use Formula 2 because it seems more robust
                        pagr_mat <- tcrossprod(tab_prop_rt)
                        sum(pagr_mat[lower.tri(pagr_mat)]) / sum(lower.tri(pagr_mat))
                      }),
                      uniform =,
                      robust = {
                        # prop. of two raters being concordant
                        1 / nCat # XXX is it that simple?
                        # XXX could use observed nbr of ratings per subject (in particular when number of raters varies per subj)
                        #+and use same procedure as for P_sj
                        #+only using expected r_ij under 1/q assumption.
                        #+In my examples, it was close to (but a little lower than) 1/q
                      },
                      stop("Unknown variant of Fleiss kappa!", call. = FALSE)
    )
    
    rval0 <- (agreeP - chanceP) / (1 - chanceP)
    attr(rval0, "agreeP") <- agreeP

    rval0    
  }#fn0
  
  
  val0 <- kappam_fleiss0(idx = seq_len(nSj))
  kappa_j <- victorinox(est = kappam_fleiss0, idx = seq_len(nSj))
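  #+victorinox() is the package's jackknife helper (defined elsewhere in the package);
  #+bias_j and se_j are taken as the leave-one-out jackknife bias and standard error estimates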
  
  # bias corrected estimate
  val <- as.numeric(val0 - kappa_j$bias_j)
  # standard error
  se_j <- kappa_j$se_j
  
  u <- val / se_j
  p.val <- 2 * stats::pnorm(q = abs(u), lower.tail = FALSE)
  # add results to return value
  rval <- c(rval, list(
    agreem = attr(val0, "agreeP"), value0 = as.numeric(val0),
    value = val, se_j = se_j, stat.name = "z", statistic = u, p.value = p.val)
  )
  
  
  # SE0 and detail
  if (variant == "fleiss" && (nCat == 2 || rval$balanced)) {
    
    # avg number of ratings given per subject
    rt_cnt <- rval$ratings / nSj # == mean(rt_cnt_sj)
    pj <- .colSums(tab_cnt_sj, m = nSj, n = nCat) / rval$ratings #prop. of categories overall
    qj <- 1 - pj
    pj.qj <- crossprod(pj, qj)[1L]
    
    
    if (nCat == 2) {
      
      # cf Fleiss (2003), 18.3, p. 613 eq (18.46) for SE0 (for kappa=0)
      rt_cnt_H <- 1 / mean(1 / rt_cnt_sj) #harmonic mean
      pq <- pj[1L] * qj[1L] #== prod(pj)
      SEkappa0 <- sqrt(2 * (rt_cnt_H-1) + ((rt_cnt - rt_cnt_H) * (1 - 4 * pq)) / (rt_cnt * pq)) /
        ((rt_cnt - 1) * sqrt(nSj * rt_cnt_H))
      
      # Lipsitz, Laird and Brennan ("Simple moment estimates ..", 1994) propose SE for any kappa (not only SE0),
      #+using an estimating equation approach. See also "Estimating the K-coefficient from a select sample" (2001)
      #+They claim their approach is easy to enhance to nCat > 2
      
    } else {
      
      stopifnot(rval$balanced)
      # alternative kappa formula in this setting, cf Fleiss (2003), p.614, eq (18.50)
      # 1 - (nSj * rt_cnt^2 - sum(tab_cnt_sj^2)) / (nSj * rt_cnt * (rt_cnt - 1) * pj.qj)
      
      # cf Fleiss (2003) 18.3, p. 616
      SEkappa0 <- sqrt(2) / (pj.qj * sqrt(nSj * rt_cnt * (rt_cnt - 1))) * 
        sqrt(pj.qj^2 - crossprod(pj * qj, qj - pj)[1L]) #SE0
      
      
      # kappa for each category level dichotomized (level vs rest)?
      if (detail) {
        # XXX think of calculating detail also for un-balanced case and with other variants than Fleiss?!
        #cf Fleiss (2003) 18.3, p.614, eq (18.50)
        stopifnot(rval$balanced)
        value_j <- 1 - diag(crossprod(tab_cnt_sj, rt_cnt - tab_cnt_sj)) / (nSj * rt_cnt * (rt_cnt - 1) * pj * qj)
        # NaN to NA
        is.na(value_j) <- !is.finite(value_j)
        SEkappa0_j <- sqrt(2 / (nSj * rt_cnt * (rt_cnt - 1))) #SE0
        u_j <- value_j / SEkappa0_j
        #SEkappa0_j <- rep_len(SEkappa0_j, length(value_j))
        #is.na(SEkappa0_j) <- !is.finite(value_j)
        p.value_j <- 2 * (1 - stats::pnorm(abs(u_j)))
        detail_j <- cbind(kappa_j = value_j,
                          se0_j = SEkappa0_j,
                          z_j = u_j,
                          p.value_j = p.value_j)
        rownames(detail_j) <- ratingScale
        
        rval <- c(rval, list(detail = detail_j))
      } #fi detail
    } #esle
    
    u0 <- rval$value0 / SEkappa0
    p.value0 <- 2 * stats::pnorm(abs(u0), lower.tail = FALSE) #two-sided
    
    rval <- c(rval,
              se0 = SEkappa0, statistic0 = u0, p.value0 = p.value0)
  }#fi fleiss-variant (2cat or balanced)
  
  rval
}



#' Agreement of a group of nominal-scale raters with a gold standard
#'
#' First, Cohen's kappa is calculated for each rater against the gold standard,
#' which by default is taken from the 1st column. The average of these
#' kappas is returned as 'kappam_gold0'. The variant setting (`robust=`) is
#' forwarded to Cohen's kappa. A bias-corrected version 'kappam_gold' and a
#' corresponding confidence interval are provided as well via the jackknife
#' method.
#'
#' @examples
#' # matrix with subjects in rows and raters in columns.
#' # 1st column is taken as gold-standard
#' m <- matrix(c("O", "G", "O",
#'               "G", "G", "R",
#'               "R", "R", "R",
#'               "G", "G", "O"), ncol = 3, byrow = TRUE)
#' kappam_gold(m)
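#'
#' # robust (Brennan-Prediger) estimate of chance agreement -----
#' kappam_gold(m, robust = TRUE)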
#'
#' @param ratings matrix. subjects by raters
#' @param refIdx numeric. index of the reference gold-standard rater. Currently,
#'   only a single gold-standard rater is supported. By default, it is the 1st
#'   rater.
#' @param robust flag. Use robust estimate for random chance of agreement by
#'   Brennan-Prediger?
#' @param ratingScale Possible levels for the rating. Or `NULL`.
#' @param conf.level confidence level for confidence interval
#' @returns list. agreement measures (raw and bias-corrected) kappa with
#'   confidence interval. Entry `raters` refers to the number of tested raters,
#'   not counting the reference rater
#' @export
kappam_gold <- function(ratings, refIdx = 1, robust = FALSE, ratingScale = NULL,
                        conf.level = .95) {
  ratings <- as.matrix(ratings)
  stopifnot(is.numeric(refIdx), length(refIdx) >= 1,
            min(refIdx) >= 1, max(refIdx) <= NCOL(ratings))
  # for the time being, only a single gold-standard rater
  #XXX check consequences of more than one gold-standard rater
  stopifnot(length(refIdx) == 1)
  
  # handle ratingScale first, before dropping incomplete observations
  if (is.null(ratingScale)) {
    # sort also drops NAs (due to na.last=NA)
    ratingScale <- sort(unique(as.character(ratings)), na.last = NA)
  } else {
    if (anyDuplicated(ratingScale)) {
      stop("Duplicated entries in rating scale!", call. = FALSE)
    }
    ratingScale <- as.character(ratingScale)
    
    if (!all(as.character(ratings[!is.na(ratings)]) %in% ratingScale)) {
      stop("Ratings ", paste0(unique(as.character(ratings)), collpase="*"),
           " do not match provided rating scale ", paste0(ratingScale, collapse="*"), call. = FALSE)
    }
  }#esle
  nCat <- length(ratingScale)
  if (nCat <= 1L) {
    stop("Rating scale needs **at least two** levels!", call. = FALSE)
  }
  
  # gold standard
  subjGoldIdx <- which(!is.na(ratings[,refIdx]))
  if (!length(subjGoldIdx)) {
    stop("No subject with gold standard rating!", call. = FALSE)
  }
  
  # keep only subjects where gold-standard is given
  ratings <- ratings[subjGoldIdx, , drop = FALSE]
  # drop raters with only NA-ratings
  # keep matrix structure also in edge cases (single remaining rater)
  ratings <- ratings[, .colSums(!is.na(ratings),
                                m = NROW(ratings), n = NCOL(ratings)) >= 1L, drop = FALSE]
  
  nSj <- NROW(ratings)
  nRaters <- NCOL(ratings) # each rater is in a column
  stopifnot(nSj >= 1L, nRaters >= 2L)
  
  # raw kappa gold. 
  # @param idx row index of data to use (used for jackknifing)
  # @param what character. Which quantity from kappa2-list to work with? Default "value" is Cohen's kappa.
  kappam_gold0 <- function(idx, what = "value") {
    # Cohen's kappa for all pairwise ratings
    k2L <- purrr::map(.x = seq_len(nRaters)[-refIdx],
                      .f = ~ kappa2(ratings[idx, c(refIdx, .x), drop = FALSE],
                                    robust = robust, ratingScale = ratingScale))
    # drop invalid cases (no valid subjects)
    k2L <- purrr::compact(k2L)
    
    # build weighted average depending on available data
    #XXX better weighting would be via inverse of squared SE of kappa2!
    stats::weighted.mean(x = purrr::map_dbl(k2L, what),
                         w = purrr::map_dbl(k2L, "subjects"))
  }#fn
  
  # raw agreement proportion (has no bias in example runs)
  agreem <- kappam_gold0(idx = seq_len(nSj), what = "agreem")
  
  value0 <- kappam_gold0(idx = seq_len(nSj), what = "value")
  kgold_j <- victorinox(est = kappam_gold0, idx = seq_len(nSj))
  
  # bias corrected estimate
  value <- value0 - kgold_j$bias_j
  # standard error
  se_j <- kgold_j$se_j
  
  # 95% CI for bias-corrected estimate
  # stats::qt(1 - (1-conf.level)/2, df = max(1, nSubj-1)) * se
  ci <- value + c(-1, 1) * stats::qnorm(1 - (1-conf.level)/2) * se_j
  
  # return:
  list(
    method = "Averaged Cohen's Kappa with gold standard",
    subjects = nSj, raters = nRaters-1, categories = nCat,
    agreem = agreem,
    value0 = value0, value = value,
    se_j = se_j, conf.level = conf.level,
    ci.lo = ci[[1]], ci.hi = ci[[2]], ci.width = diff(ci)
  )
}

#' Simulate rating data and calculate agreement with gold standard
#' 
#' The function generates simulation data according to the given categories and
#' probabilities and repeatedly applies the function [kappam_gold()].
#' Currently, there is no variation in probabilities from rater to rater;
#' only sampling variability from the multinomial distribution is at work.
#' 
#' 
#' This function is future-aware: the repeated evaluation of [kappam_gold()]
#' can run in parallel, depending on the selected future plan.
#' 
#' @examples
#' # repeatedly estimate agreement with the gold standard for simulated data
#' simulKappa(nRater = 8, cats = 3, nSubj = 11,
#'            # assumed prob for classification by raters
#'            probs = matrix(c(.6, .2, .1, # subjects of cat 1
#'                             .3, .4, .3, # subjects of cat 2
#'                             .1, .4, .5  # subjects of cat 3
#'            ), nrow = 3, byrow = TRUE))
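#' 
#' # only generate the simulated data sets, without computing kappa -----
#' # (to parallelize the kappa computations, set a future plan beforehand,
#' #  e.g. future::plan("multisession"))
#' pp <- matrix(c(.8, .1, .1,
#'                .1, .8, .1,
#'                .1, .1, .8), nrow = 3, byrow = TRUE)
#' sims <- simulKappa(nRater = 3, cats = 3, nSubj = 5, probs = pp,
#'                    mcSim = 2, simOnly = TRUE)
#' sims[[1]] # gold standard in 1st column, then the raters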
#' 
#' 
#' @param nRater numeric. number of raters.
#' @param cats categories specified either as character vector or just the
#'   numbers of categories.
#' @param nSubj numeric. number of subjects per gold standard category. Either a
#'   single number or a vector of numbers per category, e.g. for a non-balanced
#'   situation.
#' @param probs numeric square matrix (nCat x nCat) with classification
#'   probabilities. Row `i` has probabilities of rater categorization for
#'   subjects of category `i` (gold standard).
#' @param mcSim numeric. Number of Monte-Carlo simulations.
#' @param simOnly logical. Need only simulation data? Default is `FALSE`.
#' @returns dataframe of kappa-gold on the simulated datasets or (when
#'   `simOnly=TRUE`) a list of length `mcSim` with each element a simulated data
#'   set with the gold rating in the first column and then the raters.
#' @export
simulKappa <- function(nRater, cats, nSubj, probs, mcSim = 10, simOnly=FALSE) {
  # check input
  if (is.numeric(cats)) {
    if (length(cats) != 1 || cats <= 2 || cats > 26^2) {
      stop("The number of categories given is invalid!", call. = FALSE)
    }#fi
    cats <- if (cats <= 26) {
      LETTERS[seq_len(cats)]
    } else {
      with(tidyr::expand_grid(c1 = LETTERS, c2 = LETTERS),
           paste0(c1, c2))[seq_len(cats)]
    }
  }#fi
  
  stopifnot(is.character(cats) || is.factor(cats))
  cats <- unique(as.character(cats))
  ncats <- length(cats)
  if (ncats <= 1) {
    stop("We need at least 2 categories!", call. = FALSE)
  }
  
  if (length(nSubj) == 1L) {
    nSubj <- rep.int(nSubj, times = ncats)
  }#fi
  
  stopifnot(is.numeric(nSubj), length(nSubj) == ncats,
            all(is.finite(nSubj)), all(nSubj >= 1L))
  
  if (missing(probs)) {
    stop("Please specify the assumed probabilities how subjects are rated.",
         call. = FALSE)
  }#fi
  stopifnot(is.matrix(probs), is.numeric(probs), NROW(probs) == ncats,
            NCOL(probs) == NROW(probs))
  nSubjTotal <- sum(nSubj)
  
  
  classif_of_rater <- vector(mode = 'list', length = ncats)
  classif_per_rater <- replicate(n = nRater, expr = {
    # currently, no variation in pp from rater to rater. could be added here.
    #+only sampling variability from multinomial distribution is at work
    for (ctgry in seq_along(cats)) {
      
      # raters act independently of each other,
      #+propensity of a subject to a category is not modelled individually
      # the order of ratings does not play a role here as we only look to the gold-standard
      # Within raters, the lexical ordering would increase the agreement
      classif_of_rater[[ctgry]] <- apply(
        X = stats::rmultinom(n=mcSim, size = nSubj[[ctgry]], prob = probs[ctgry,]),
        MARGIN = 2,
        FUN = function(x) rep.int(cats, times = x),
        simplify = FALSE)
    }#rof ctgry
    
    # per rater, combine the classifications for the subjects of the different categories
    purrr::pmap(classif_of_rater, .f = c)
  }, simplify = FALSE)
  rm(classif_of_rater)
  
  names(classif_per_rater) <- paste0("R", seq_len(nRater))
  
  
  goldStdOutcome <- rep.int(cats, times = nSubj)
  # build simulated data set per simulation round
  simData <- purrr::pmap(.l = classif_per_rater,
                         .f = function(...) # pass on arguments to tibble (in order to become columns)
                           tibble(gold = goldStdOutcome, # gold standard as first column
                                  ...,
                                  .rows = nSubjTotal))
  
  if (isTRUE(simOnly)) {
    return(simData)
  }
  
  resL <- future.apply::future_lapply(X = simData, FUN = kappam_gold)
  
  # return
  tibble(nRater = nRater, nSubjTotal = nSubjTotal,
         kappam_gold = purrr::map_dbl(.x = resL, "value"),
         ci_lo = purrr::map_dbl(.x = resL, "ci.lo"),
         ci_hi = purrr::map_dbl(.x = resL, "ci.hi"),
         ci_halfwidth = purrr::map_dbl(.x = resL, "ci.width") / 2L)
}
