#' @title
#' Calculate Fleiss' kappa and Krippendorff's alpha
#'
#' @description
#' This function was adapted from the paper _Measuring inter-rater reliability
#' for nominal data – which coefficients and confidence intervals are
#' appropriate?_ by Zapf et al. Their work compared the two reliability measures
#' and confidence intervals to see which provide the best statistical properties
#' for the assessment of inter-rater reliability in different situations. With
#' their paper they provided supplemental R code which was adapted to this
#' function. Most is kept intact, but I adjusted the output to return a `tibble`
#' instead of a list and text like they had done.
#'
#' Their conclusions: Fleiss' K and Krippendorff's alpha with bootstrap
#' confidence intervals are equally suitable for the analysis of reliability of
#' complete nominal data. The asymptotic confidence interval for Fleiss' K
#' should not be used. In the case of missing data or data or higher than
#' nominal order, Krippendorff’s alpha is recommended.
#'
#' @references
#' \url{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4974794/}
#' Efron B. Six questions raised by the bootstrap. Exploring the limits of
#' bootstrap. Editors LePage R, Billard L. Technical Report No. 139. Division of
#' Biostatistics, Stanford University. Wiley & Sons, New York; 1992.
#'
#'
#' @param ratings_t A matrix or data frame or tibble of the ratings. Rows =
#' individuals; columns = raters. Missing values coded by `NA`.
#' @param alpha_q Numeric; two-sided type one error, default = 0.05
#' @param nboot Integer; number of Bootstrap samples, default = 1000
#' @param scaling String; measurement scale ("nominal", "ordinal", "interval",
#' "ratio"), default = "nominal"
#'
#' @importFrom dplyr distinct
#' @importFrom dplyr group_by
#' @importFrom dplyr mutate
#' @importFrom dplyr n_distinct
#' @importFrom dplyr pull
#' @importFrom dplyr rename
#' @importFrom dplyr row_number
#' @importFrom dplyr summarise
#' @importFrom dplyr ungroup
#' @importFrom tibble as_tibble
#' @importFrom tibble tibble
#' @importFrom tidyr complete
#' @importFrom tidyr fill
#' @importFrom tidyr pivot_longer
#' @importFrom stats na.omit
#' @importFrom stats quantile
#'
#' @return A tibble with the following columns
#' \item{measure}{Name of the measure}
#' \item{scale}{Measurement scale}
#' \item{N_subjects}{For kappa: number of subjects without missing values; For
#' alpha: number of subjects with two or more ratings.}
#' \item{n_raters}{Number of raters}
#' \item{k_categories}{Number of categories}
#' \item{obs_agr}{Observed agreement for the complete cases and for all cases
#' with at least two ratings}
#' \item{estimate}{Point estimates: Fleiss' kappa and Krippendorff's alpha}
#' \item{lower_asym_ci}{Lower asymptotic confidence interval}
#' \item{upper_asym_ci}{Upper asymptotic confidence interval}
#' \item{lower_boot_ci}{Lower bootstrap confidence interval}
#' \item{upper_boot_ci}{Upper bootstrap confidence interval}
#' @export
#'
#' @examples
#' ratings_t <- matrix(ncol = 3, nrow = 10,
#' c(5, 5, 5,
#' 3, 5, 5,
#' 1, 4, 4,
#' 3, 3, 3,
#' 4, 4, 5,
#' 1, 3, 4,
#' 3, 3, 3,
#' 1, 1, 3,
#' 2, 2, 5,
#' 3, 3, 4),
#' byrow = TRUE)
#' k_alpha(ratings_t,
#' alpha_q = 0.05,
#' nboot = 1000,
#' scaling = "nominal")
#' k_alpha(as.data.frame(ratings_t),
#' alpha_q = 0.05,
#' nboot = 1000,
#' scaling = "nominal")
#' k_alpha(tibble::as_tibble(ratings_t),
#' alpha_q = 0.05,
#' nboot = 1000,
#' scaling = "nominal")
k_alpha <- function(ratings_t,
alpha_q = 0.05,
nboot = 1000,
scaling = "nominal") {
#### Helper functions to estimate stats --------------------------------
## Function for the estimation of Fleiss' K ----------------
k_func <- function(N, n, k, ratings, categ) {
# n_ij = number of raters who classed subject i in category j
n_ij = matrix(ncol = k, nrow = N)
step = 1
for (j in categ) {
for (i in 1:N) {
n_ij[i, step] = sum(as.numeric(ratings[i, ] == j))
}
step = step + 1
}
# estimation of K_j
p_j = apply(n_ij, 2, sum) / (N * n)
q_j = 1 - p_j
k_j = 1 - apply(n_ij * (n - n_ij), 2, sum) / (N * n * (n - 1) * p_j *
q_j)
# estimation of the overall K
k_t = sum(p_j * q_j * k_j) / sum(p_j * q_j)
return(list(k_t, p_j, q_j))
}
## Function for the estimation of Krippendorff's alpha ----------------
alpha_func <- function(k, n, N, ratings, categ) {
# conicidence matrix
CM = matrix(ncol = k, nrow = k, 0)
vn <- function(datavec)
sum(!is.na(datavec))
if (any(is.na(ratings)))
mc = apply(ratings, 1, vn) - 1
else
mc = rep(n - 1, N)
for (i in 1:N) {
for (j in 1:(n - 1)) {
for (jt in (j + 1):n) {
if (!is.na(ratings[i, j]) && !is.na(ratings[i, jt])) {
index1 = which(categ == ratings[i, j])
index2 = which(categ == ratings[i, jt])
CM[index1, index2] = CM[index1, index2] + (1 + (index1 == index2)) /
mc[i]
if (index1 != index2) {
CM[index2, index1] = CM[index1, index2]
}
}
}
}
}
nmv <- sum(apply(CM, 2, sum))
nc = apply(CM, 1, sum)
ncnk = matrix(0, nrow = k, ncol = k)
# matrix of expected disagreement
D_e = matrix(0, ncol = k, nrow = k)
for (C in 1:k) {
for (Ct in 1:k) {
if (C == Ct) {
D_e[C, Ct] = nc[C] * (nc[Ct] - 1) / (nmv - 1)
}
if (C != Ct) {
D_e[C, Ct] = nc[C] * nc[Ct] / (nmv - 1)
}
ncnk[C, Ct] = nc[C] * nc[Ct]
ncnk[Ct, C] = ncnk[C, Ct]
}
}
# matrix of metric differences
diff2 = matrix(0, nrow = k, ncol = k)
# nominal
if (match(scaling[1], "nominal", 0)) {
diff2 = matrix(1, ncol = k, nrow = k)
diag(diff2) = 0
}
# ordinal
if (match(scaling[1], "ordinal", 0)) {
for (C in 1:k) {
for (Ct in 1:k) {
if (C != Ct) {
tmp = nc[C:Ct]
diff2[C, Ct] = (sum(tmp) - nc[C] / 2 - nc[Ct] / 2) ^ 2
diff2[Ct, C] = diff2[C, Ct]
}
}
}
}
# interval
if (match(scaling[1], "interval", 0)) {
for (C in 1:k) {
for (Ct in 1:k) {
if (C != Ct) {
diff2[C, Ct] = (as.numeric(categ)[C] - as.numeric(categ)[Ct]) ^ 2
diff2[Ct, C] = diff2[C, Ct]
}
}
}
}
# ratio
if (match(scaling[1], "ratio", 0)) {
for (C in 1:k) {
for (Ct in 1:k) {
if (C != Ct) {
diff2[C, Ct] = ((as.numeric(categ)[C] - as.numeric(categ)[Ct]) /
(as.numeric(categ)[C] + as.numeric(categ)[Ct])) ^
2
diff2[Ct, C] = diff2[C, Ct]
}
}
}
}
# point estimator of Krippendorff's alpha
tmp = diff2 * CM
num = sum(tmp)
tmp = diff2 * D_e
den = sum(tmp)
if (den > 0) {
alpha_boot = 1 - num / den
}
if (den <= 0) {
alpha_est = NA
}
return(alpha_boot)
}
#### Fleiss' K --------------------------------
# check, if measurement scale is nominal
if (match(scaling[1], "nominal", 0)) {
# deleting all subjects with missing values
ratings_c <- as.matrix(na.omit(ratings_t))
# N = number of subjects, n = number of raters, k = number of categories
N_c = nrow(ratings_c)
v = function(dat) {
min(dat) == max(dat)
}
agr_k = sum(apply(ratings_c, 1, v)) / N_c
# check, if there are at least two individuals without missing values
if (N_c < 2) {
print(paste0("There are less than two subjects without missing ",
"values. Therefore, Fleiss' K cannot be calculated."))
}
if (N_c >= 2) {
n_c = ncol(ratings_c)
categ_c = levels(as.factor(ratings_c))
k_c = length(categ_c)
# point estimator of Fleiss` K
k_ = k_func(N_c, n_c, k_c, ratings_c, categ_c)
k_est = k_[[1]]
p_j = k_[[2]]
q_j = k_[[3]]
## asymptotic confidence interval ----------------
# estimation of the standard error
se_k = (sqrt(2) / (sum(p_j * q_j) * sqrt(N_c * n_c * (n_c - 1)))) *
sqrt(sum(p_j * q_j) ^ 2 - sum(p_j * q_j * (q_j - p_j)))
# asymptotic confidence interval for Fleiss' K
CI_asymp_k = k_est + c(-1, 1) * qnorm(1 - alpha_q / 2) * se_k
}
}
#### Krippendorff's alpha --------------------------------
# deleting all subject with less than two ratings
f <- function(x) sum(!is.na(x))
# deleting all subjects with only one rating
ratings = as.matrix(ratings_t[apply(ratings_t, 1, f) > 1, ])
# N = number of subjects, n = number of raters, k = number of categories
N_kr = nrow(ratings)
v <- function(dat) {
min(dat, na.rm = TRUE) == max(dat, na.rm = TRUE)
}
agr_alpha = sum(apply(ratings, 1, v)) / N_kr
n_kr = ncol(ratings)
categ = levels(as.factor(ratings))
k_kr = length(categ)
# point estimator of Krippendorff's alpha
alpha_est = alpha_func(k_kr, n_kr, N_kr, ratings, categ)
#### Bootstrap confidence intervals --------------------------------
# K and alpha in each Bootstrap sample
k_boot = 0
alpha_boot = 0
for (iboot in 1:nboot) {
if (match(scaling[1], "nominal", 0)) {
index.new = sample(seq(1, N_c, 1), N_c, replace = TRUE)
ratings_b = ratings_c[index.new, ]
n = ncol(ratings_b)
categ = levels(as.factor(ratings_b))
k = length(categ)
k.b <- k_func(N_c, n, k, ratings_b, categ)[[1]]
k_boot = c(k_boot, k.b)
}
f <- function(x) sum(!is.na(x))
# deleting all subjects with only one rating
index.new = sample(seq(1, N_kr, 1), N_kr, replace = TRUE)
ratings_b = ratings[index.new, ]
n = ncol(ratings)
categ = levels(as.factor(ratings))
k = length(categ)
alpha_b = alpha_func(k, n, N_kr, ratings_b, categ)
alpha_boot = c(alpha_boot, alpha_b)
}
# confidence interval using the percentiles from the Bootstrap samples
if (match(scaling[1], "nominal", 0)) {
CI_boot_k = quantile(k_boot[-1],
probs = c(alpha_q / 2, 1 - alpha_q / 2),
na.rm = TRUE)
}
# confidence interval using the percentiles from the Bootstrap samples
CI_boot_alpha = quantile(alpha_boot[-1],
probs = c(alpha_q / 2, 1 - alpha_q / 2),
na.rm = TRUE)
#### Observed Agreement --------------------------------
# These, c(agr_k, agr_alpha), don't seem to calculate the correct observed
# agreement. Doing it my own way.
N <- N_subjects <- Pi <- category <- k_categories <- n_raters <- nij <- rater <- subject <- NULL
calc_step <- ratings_t |>
tibble::as_tibble(.name_repair = "unique") |>
mutate(subject = dplyr::row_number()) |>
tidyr::pivot_longer(cols = -subject,
names_to = "rater",
values_to = "category") |>
mutate(N_subjects = dplyr::n_distinct(subject),
n_raters = dplyr::n_distinct(rater),
k_categories = dplyr::n_distinct(category)) |>
dplyr::count(N_subjects,
n_raters,
k_categories,
subject,
category) |>
tidyr::complete(subject,
category,
fill = list(n = 0)) |>
tidyr::fill(N_subjects:k_categories,
.direction = "downup") |>
dplyr::rename(nij = n) |>
mutate(N = sum(nij)) |>
group_by(category) |>
mutate(pj = sum(nij) / N) |>
ungroup() |>
group_by(subject) |>
mutate(Pi = nij ^ 2,
Pi = sum(Pi),
Pi = Pi - n_raters,
Pi = Pi / (n_raters * (n_raters - 1))) |>
ungroup()
pbar <- calc_step |>
dplyr::distinct(subject,
Pi) |>
summarise(pbar = mean(Pi)) |>
dplyr::pull()
agr_k <- agr_alpha <- pbar
#### Output --------------------------------
## For nominal data ----------------
if (match(scaling[1], "nominal", 0)) {
return(tibble::tibble(
measure = c("Fleiss' kappa", "Krippendorff's alpha"),
scale = c(scaling[1], scaling[1]),
# for kappa: N (number of subjects without missing values)
# alpha: N (number of subjects with two or more ratings)
N_subjects = c(N_c, N_kr),
n_raters = c(n_c, n_kr),
k_categories = c(k_c, k_kr),
obs_agr = c(agr_k, agr_alpha),
estimate = c(k_est, alpha_est),
lower_asym_ci = c(CI_asymp_k[[1]], NA),
upper_asym_ci = c(CI_asymp_k[[2]], NA),
lower_boot_ci = c(CI_boot_k[[1]], CI_boot_alpha[[1]]),
upper_boot_ci = c(CI_boot_k[[2]], CI_boot_alpha[[2]])
))
}
## For not nominal data ----------------
if (!match(scaling[1], "nominal", 0)) {
# Fleiss' K cannot be calculated, because it is only appropriate for nominal
# data.
tibble::tibble(
measure = c("Fleiss' kappa", "Krippendorff's alpha"),
scale = c(scaling[1], scaling[1]),
# for kappa: N (number of subjects without missing values)
# alpha: N (number of subjects with two or more ratings)
N_subjects = c(NA, N_kr),
n_raters = c(NA, n_kr),
k_categories = c(NA, k_kr),
obs_agr = c(NA, agr_alpha),
estimate = c(NA, alpha_est),
lower_asym_ci = c(NA, NA),
upper_asym_ci = c(NA, NA),
lower_boot_ci = c(NA, CI_boot_alpha[[1]]),
upper_boot_ci = c(NA, CI_boot_alpha[[2]])
)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.