tests/testthat/test-kappa.R

# data -----------------------------------------------------------------

# cf dataset anxiety from irr-package
anxiety <- data.frame(rater1 = c(3L, 3L, 3L, 4L, 5L, 5L, 2L, 3L, 5L, 2L, 2L, 6L, 1L, 5L, 2L, 2L, 1L, 2L, 4L, 3L),
                      rater2 = c(3L, 6L, 4L, 6L, 2L, 4L, 2L, 4L, 3L, 3L, 2L, 3L, 3L, 3L, 2L, 2L, 1L, 3L, 3L, 4L),
                      rater3 = c(2L, 1L, 4L, 4L, 3L, 2L, 1L, 6L, 1L, 1L, 1L, 2L, 3L, 3L, 1L, 1L, 3L, 3L, 2L, 2L))

# example data for 3-level categorical UICC rating
# gold standard in first column with no NA
# ordinary raters have many NAs
uicc <- structure(list(
  GS = c("I", "II", "II", "III", "I", "III", "I", 
         "I", "III", "II", "III", "II", "I", "II", "II", "III", "III", 
         "I", "I", "I", "III", "I", "III", "III", "III", "II", "III", 
         "III", "I", "I", "I", "I", "III", "I", "I", "I", "I", "II", "II", 
         "I", "I", "III", "III", "II", "I", "I", "III", "II", "III", "I", 
         "I"),
  rater1 = c("III", "II", "II", "II", "III", "III", NA, "I", 
             NA, "III", NA, "I", "I", "III", NA, "III", "III", "I", "I", "I", 
             NA, "I", NA, NA, NA, NA, NA, "I", "I", "II", NA, "I", "II", NA, 
             "II", "III", "I", "II", "III", "II", NA, "II", NA, NA, "I", "I", 
             "III", NA, "III", NA, "I"),
  rater2 = c("II", "II", "II", "II", 
             NA, "III", "II", NA, "II", "II", "III", "I", "II", NA, NA, "III", 
             "I", "I", "I", "I", NA, "III", "III", "II", "II", NA, "I", "II", 
             "I", "II", "I", "I", "I", NA, NA, "II", "I", "II", NA, "II", 
             "I", NA, "II", "II", "II", "I", NA, "III", NA, "II", "II"),
  rater3 = c(NA, 
             "II", "II", "I", "III", "III", "I", "III", "I", "III", "III", 
             "I", "II", "III", "III", "III", "I", "I", "II", "II", "III", 
             "III", "III", "II", "III", "III", "I", NA, "I", "I", NA, "I", 
             "I", NA, "III", "III", "II", NA, "III", "III", "III", "III", 
             "III", "III", NA, NA, "II", "III", "III", "III", "II"),
  rater4 = c(NA, 
             NA, NA, NA, NA, NA, "I", NA, NA, "II", NA, "I", NA, NA, NA, "I", 
             NA, NA, "I", "I", "II", "III", "III", "II", "II", NA, "I", "II", 
             "I", "I", "I", "I", "II", NA, NA, "II", "I", "II", "III", "I", 
             "II", "III", "II", "III", "II", "I", "III", "II", "III", NA, 
             "I"),
  rater5 = c("I", "II", "I", "I", NA, NA, "I", NA, NA, "III", 
             NA, "I", "II", "III", NA, "I", "I", "I", "I", "II", NA, "III", 
             NA, "III", NA, "II", "III", "II", "I", "I", NA, "I", "I", "I", 
             "I", NA, "II", "II", "II", "II", "II", "III", "II", "II", "I", 
             "II", "II", "II", "III", "II", "II"),
  rater6 = c(NA, "II", "II", 
             "II", NA, "III", "II", "I", "II", "III", "III", "I", "I", "III", 
             "III", NA, "I", "I", "II", "I", NA, "III", "III", NA, NA, "II", 
             "I", "II", "I", "II", "III", "II", "II", "I", NA, "II", "II", 
             "II", "III", "II", "II", "III", NA, "II", "I", "I", "III", "II", 
             "III", NA, "II"),
  rater7 = c(NA, "II", "II", "II", "III", "III", 
             "III", "I", "I", "II", "II", "II", "II", "III", "III", "III", 
             NA, "II", "I", "II", NA, "III", "III", "II", "II", "III", "III", 
             NA, "I", "II", "III", "I", "II", NA, "III", "III", "I", "II", 
             "III", "III", "III", "III", "III", "III", "I", "I", "II", "III", 
             "III", NA, "I"),
  rater8 = c("I", NA, "I", NA, "II", "III", "III", 
             "III", "I", "III", "III", "I", NA, "III", "III", NA, "III", NA, 
             NA, NA, "III", "III", "III", NA, NA, "III", "III", NA, NA, NA, 
             NA, NA, "I", "III", "III", "III", NA, NA, "III", "III", NA, "III", 
             "III", "III", NA, "III", "III", "III", "III", "III", NA),
  rater9 = c(NA, 
             NA, "I", NA, NA, NA, "II", NA, NA, "III", "III", NA, NA, "III", 
             "II", "III", "I", "II", "I", "III", "III", "III", "III", "II", 
             "II", "III", "III", "III", "I", "I", NA, NA, "I", "I", "III", 
             NA, "II", "I", "III", "II", "III", "III", NA, NA, "II", NA, "III", 
             NA, "III", "II", "II"),
  rater10 = c("II", "I", "I", "II", "III", 
              "III", "III", "II", "I", NA, "I", NA, "I", NA, NA, "II", NA, 
              "I", "I", NA, NA, "III", NA, "II", "I", NA, "I", "I", "II", "I", 
              NA, "I", NA, "III", NA, "III", "I", NA, "III", "II", NA, "III", 
              NA, NA, "I", NA, "III", "I", NA, "III", "I"),
  rater11 = c("II", 
              "I", "II", "I", "I", "III", NA, NA, "III", NA, NA, "I", "I", 
              "III", "II", "III", NA, "III", NA, "I", "III", NA, "III", "II", 
              "I", "II", "I", "I", "I", "I", NA, "I", NA, NA, NA, "III", "I", 
              "II", "III", "III", "II", "III", NA, NA, "II", "I", NA, NA, NA, 
              NA, NA),
  rater12 = c(NA, NA, NA, "III", "III", "III", "III", 
              NA, "III", "III", "III", NA, "I", "III", NA, "III", NA, "I", 
              NA, "III", "III", "III", "III", "III", "III", "II", "III", "II", 
              "I", "I", NA, NA, NA, "III", NA, "III", "II", "III", "III", NA, 
              NA, "III", "III", NA, NA, "II", "III", NA, "III", NA, "II"), 
  rater13 = c("I", "I", "I", "I", "III", "III", NA, NA, "II", 
              "III", "I", "I", NA, NA, "III", "II", "I", "III", "I", "I", 
              "III", NA, NA, NA, NA, NA, NA, NA, NA, "I", NA, "I", "II", 
              "II", NA, "III", "I", NA, "III", NA, "III", "II", "III", 
              NA, "I", "I", "III", "III", NA, "II", "I"),
  rater14 = c("II", "II", "II", "II", "II", "II", "II", "II", "II", NA, "II", 
              "II", "II", "II", NA, "II", "I", "II", "II", "II", "II", 
              NA, NA, "II", "II", "II", "I", "I", "I", "II", "I", "I", 
              "II", "I", "I", "II", "II", "II", "II", "I", "II", "III", 
              "II", "I", "II", NA, "II", "II", NA, NA, "II"), 
  rater15 = c("II", "II", "II", "II", NA, NA, "I", "II", "II", 
              "II", "III", "I", "II", "III", "III", "III", "I", "II", "I", 
              "III", NA, "III", "III", "III", "II", "II", "I", "I", "I", 
              "I", "I", "I", "II", "III", "I", NA, NA, "II", "III", "II", 
              "II", "III", "II", "II", "I", "I", "II", "II", NA, "II", 
              "I")),
  row.names = c("sj1", "sj2", "sj3", "sj4", 
                "sj5", "sj6", "sj7", "sj8", "sj9", "sj10", "sj11", 
                "sj12", "sj13", "sj14", "sj15", "sj16", "sj17", "sj18", 
                "sj19", "sj20", "sj21", "sj22", "sj23", "sj24", "sj25", 
                "sj26", "sj27", "sj28", "sj29", "sj30", "sj31", "sj32", 
                "sj33", "sj34", "sj35", "sj36", "sj37", "sj38", "sj39", 
                "sj40", "sj41", "sj42", "sj43", "sj44", "sj45", "sj46", 
                "sj47", "sj48", "sj49", "sj50", "sj51"),
  class = "data.frame")


# cf Fleiss, Cohen, Everitt "large sample std err of kappa", 1969
# Table1 (and also Table2)
FCEdat <- data.frame(n = as.integer(200 * c(.53, .11, .01,
                                            .05, .14, .06,
                                            .02, .05, .03)),
                     R1 = rep.int(c("I", "II", "III"), times = 3),
                     R2 = rep(c("I", "II", "III"), each = 3)) |> 
  tidyr::uncount(weights = n)


test_that("ratings data sets", {
  expect_true(is.matrix(diagnoses))
  expect_type(diagnoses, type = "character")
  expect_s3_class(anxiety, class = "data.frame")
  expect_s3_class(uicc, class = "data.frame")
  expect_s3_class(FCEdat, class = "data.frame")
  expect_identical(NROW(FCEdat), expected = 200L)
  expect_identical(NCOL(FCEdat), expected = 2L)
  
  # 1st column is gold standard
  expect_identical(names(uicc)[[1L]], expected = "GS")
  expect_identical(sum(is.na(uicc$GS)), expected = 0L)
})



# kappa -------------------------------------------------------------------


test_that("kappa2 implementation", {
  # select two columns manually
  k2_diag23 <- kappa2(diagnoses[, 2:3])
  expect_type(k2_diag23, type = "list")
  expect_named(k2_diag23,
               expected = c("method", "subjects", "raters", "categories",
                            "robust", "agreem", "value", "se"))
  expect_equal(k2_diag23$value,
               expected = irr::kappa2(diagnoses[,2:3])$value)
  
  # all sets of two columns
  purrr::walk(.x = utils::combn(NCOL(diagnoses), 2L, simplify = FALSE),
              .f = ~ expect_equal(kappa2(diagnoses[, .x])$value,
                                 expected = irr::kappa2(diagnoses[, .x])$value))
  
  # cf Fleiss, Cohen, Everitt "large sample std err of kappa", 1969
  # unweighted kappa (after Table2)
  k2_FCE <- kappa2(FCEdat)
  expect_equal(k2_FCE$value, expected = .429, tolerance = 1e-3)
  expect_equal(k2_FCE$se^2, expected = .002885, tolerance = 1e-4)
})


test_that("kappam_fleiss", {
  diag_kf <- kappam_fleiss(ratings = diagnoses, variant = "fleiss")
  # actually, Conger does not fit here to the diagnoses dataset
  #+as we have no select set of six fixed raters!
  diag_kc <- kappam_fleiss(ratings = diagnoses, variant = "conger")
  
  K_NAMES <- c("method", "subjects", "raters", "categories", "ratings", "balanced",
               "agreem", "value0", "value", "se_j", "stat.name", "statistic", "p.value")
  expect_type(diag_kf, "list")
  expect_named(diag_kf, expected = c(K_NAMES, "se0", "statistic0", "p.value0"))
  expect_equal(diag_kf$value0, expected = 0.43024, tolerance = 1e-4) # from irr::kappam.fleiss
  
  expect_type(diag_kc, "list")
  expect_named(diag_kc, expected = K_NAMES)
  # diagnoses with un-permuated rater1-rater6: 0.44181
  expect_equal(diag_kc$value0, expected = 0.43112, tolerance = 1e-4) # from irr::kappam.fleiss
  
  # Fleiss, "Stat. methods for rates and prop.", 3rd ed, section 18.3, table 18.8:
  # five ratings on each of ten subjects into one of three categories
  # we assume that we have identical raters
  fleissmEx18.8 <- list(
    nSj = 10,
    nRt = 5)
  
  fleissmEx18.8$tab_cnt_sj <- matrix(c(1, 4, 0,
                                   2, 0, 3,
                                   0, 0, 5,
                                   4, 0, 1,
                                   3, 0, 2,
                                   1, 4, 0,
                                   5, 0, 0,
                                   0, 4, 1,
                                   1, 0, 4,
                                   3, 0, 2),
                                 nrow = fleissmEx18.8$nSj, byrow = TRUE,
                                 dimnames = list(subj = paste0("S", seq_len(fleissmEx18.8$nSj)),
                                                 cat = LETTERS[1:3]))
  fleissmEx18.8$prob_tab <- colSums(fleissmEx18.8$tab_cnt_sj) / sum(fleissmEx18.8$tab_cnt_sj)
  
  
  fleissmEx18.8$rat <- fleissmEx18.8$tab_cnt_sj |> 
    tibble::as_tibble(rownames = "sjID") |> 
    tidyr::pivot_longer(cols = c(A, B, C), names_to = "Category", values_to = "Cnt") |> 
    tidyr::uncount(weights = Cnt) |> 
    # optionally, permute rows within sjID to have exchangeable raters
    #+with no permutation it is ordered and first rater tends to give A etc
    dplyr::mutate(rID = dplyr::row_number(), .by = sjID) |> 
    tidyr::pivot_wider(id_cols = sjID,
                       names_from = rID, names_prefix = "R",
                       values_from = Category) |> 
    tibble::column_to_rownames(var = "sjID")
  
  fleissmEx18.8$kappa <- kappam_fleiss(ratings = fleissmEx18.8$rat, detail = TRUE)
  # 
  with(fleissmEx18.8, {
    expect_identical(kappa$categories, 3L)
    expect_equal(kappa$value0, 0.42, tolerance = .01) #value given by Fleiss
    expect_equal(kappa$value0,
                 1 - (nSj * nRt^2 - sum(tab_cnt_sj^2)) / (nSj * nRt * (nRt-1) * crossprod(prob_tab, 1-prob_tab)[1]))
    expect_equal(kappa$se0, 0.072, tolerance = .01)
    expect_equal(kappa$statistic0, 5.83, tolerance = .01)
    expect_identical(colnames(kappa$detail), expected = c("kappa_j", "se0_j", "z_j", "p.value_j"))
    expect_equal(kappa$detail[, "kappa_j"], c(A=.29, B=.67, C=.35), tolerance = .01)
    expect_equal(kappa$detail[, "se0_j"], rep(0.10, 3), ignore_attr = TRUE)
  })
  
  # cf Fleiss (2003), sec 18.3, table 18.7
  fleissmEx18.7 <- matrix(c(2, 2,
                            2, 0,
                            3, 2,
                            4, 3,
                            3, 3,
                            4, 1,
                            3, 0,
                            5, 0,
                            2, 0,
                            4, 4,
                            5, 5,
                            3, 3,
                            4, 4,
                            4, 3,
                            2, 0,
                            2, 2,
                            3, 1,
                            2, 1,
                            4, 1,
                            5, 4,
                            3, 2,
                            4, 0,
                            3, 0,
                            3, 3,
                            2, 2), ncol = 2, byrow = TRUE,
                          dimnames = list(subj = paste0("S", seq_len(25)),
                                          cnt = c("m_i", "x_i"))) |> 
    tibble::as_tibble(rownames = "sjID") |> 
    dplyr::rename(A = x_i) |> 
    dplyr::mutate(B = m_i - A, m_i = NULL) |> 
    tidyr::pivot_longer(cols = c(A, B), names_to = "Category", values_to = "Cnt") |> 
    tidyr::uncount(weights = Cnt) |> 
    # optionally, permute rows within sjID to have exchangeable raters
    #+with no permutation it is ordered and first rater tends to give A etc
    dplyr::mutate(rID = dplyr::row_number(), .by = sjID) |> 
    tidyr::pivot_wider(id_cols = sjID,
                       names_from = rID, names_prefix = "R",
                       values_from = Category) |> 
    tibble::column_to_rownames(var = "sjID")
  
  kfEx18.7 <- kappam_fleiss(ratings = fleissmEx18.7)
  kfEx18.7_cong <- kappam_fleiss(ratings = fleissmEx18.7, variant = "cong")
  kfEx18.7_rob <- kappam_fleiss(ratings = fleissmEx18.7, variant = "rob")
  
  expect_identical(kfEx18.7$subjects, 25L)
  expect_identical(kfEx18.7$raters, 5L)
  expect_identical(kfEx18.7$ratings, 81L)
  expect_identical(kfEx18.7$categories, 2L)
  expect_false(kfEx18.7$balanced)
  expect_equal(kfEx18.7$value0, 0.54, tolerance = .05) #cf sect 18.3, p. 613
  expect_equal(kfEx18.7$se0, 0.103, tolerance = .01)
  expect_equal(kfEx18.7$statistic0, 5.24, tolerance = .05)
  
  # conger reduces the chance agreement a little bit, hence kappa increases
  expect_gt(kfEx18.7_cong$value0, kfEx18.7$value0)
  # robust assumes smallest possible chance agreement, hence kappa increases
  expect_gt(kfEx18.7_rob$value0, kfEx18.7$value0)
})


test_that("kappam_gold", {
  
  # take first rater as reference
  kg_anx <- kappam_gold(ratings = anxiety)
  
  expect_identical(kg_anx$subjects, expected = NROW(anxiety))
  expect_identical(kg_anx$raters, expected = NCOL(anxiety)-1)
  # kappam_gold as average of pairwise Cohen's kappa
  expect_equal(kg_anx$value0,
               expected = mean(purrr::map_dbl(2L:NCOL(anxiety), ~ kappa2(ratings = anxiety[, c(1L,.x)])$value)))
  
  
  kg_diag <- kappam_gold(ratings = diagnoses)
  expect_identical(kg_diag$subjects, expected = NROW(diagnoses))
  expect_identical(kg_diag$raters, expected = NCOL(diagnoses)-1)
  # kappam_gold as average of pairwise Cohen's kappa
  expect_equal(kg_diag$value0,
               expected = mean(purrr::map_dbl(2:NCOL(diagnoses),
                                              ~ kappa2(ratings = diagnoses[, c(1L, .x)])$value)))
  
  # copes with NAs
  expect_no_error({kg_uicc <- kappam_gold(ratings = uicc)})
  expect_type(kg_uicc, type = "list")
  expect_named(kg_uicc, expected = c("method",
                                     "subjects", "raters", "categories",
                                     "agreem",
                                     "value0", "value",
                                     "se_j", "conf.level", "ci.lo", "ci.hi", "ci.width"))
  expect_identical(kg_uicc$subjects, NROW(uicc))
  expect_identical(kg_uicc$raters, NCOL(uicc)-1)
  # pairwise Cohen's kappa with gold standard
  k2_uicc <- purrr::map(.x = 2L:NCOL(uicc), .f = ~ kappa2(ratings = uicc[, c(1L, .x)]))
  expect_identical(purrr::map_dbl(k2_uicc, "subjects"), expected = colSums(!is.na(uicc[,-1L])),
                   ignore_attr = "names")
  expect_equal(kg_uicc$value0,
               expected = weighted.mean(x = purrr::map_dbl(k2_uicc, "value"),
                                        w = purrr::map_dbl(k2_uicc, "subjects")))
})


test_that("kappa simulation", {
  simK <- simulKappa(nRater = 8, cats = 3, nSubj = 11, mcSim = 9,
             # assumed prob for classification by raters
             probs = matrix(c(.6, .2, .1, # subjects of cat 1
                              .3, .4, .3, # subjects of cat 2
                              .1, .4, .5  # subjects of cat 3
             ), nrow = 3, byrow = TRUE))
  
  expect_type(simK, "list")
  expect_s3_class(simK, class = "data.frame")
  
  expect_named(simK,
               expected = c("nRater", "nSubjTotal", "kappam_gold", "ci_lo", "ci_hi", "ci_halfwidth"))
  expect_identical(simK$nRater, expected = rep.int(8, times = 9))
  
})

Try the kappaGold package in your browser

Any scripts or data that you put into this service are public.

kappaGold documentation built on April 4, 2025, 1:02 a.m.