```{r}
library(tidyverse)
library(flexdashboard)
library(shiny)
library(irr)
library(printy)
library(iccbot)

# Default dataset from Shrout and Fleiss.
d <- example_shrout_fleiss()

getData <- function() {
  if (is.null(input$file1)) {
    d
  } else {
    read.csv(input$file1$datapath)
  }
}

runICC <- function() {
  req(input$use_twoway_model)
  req(input$single_or_average)
  req(input$agreement_or_consistency)

  model <- ifelse(
    input$use_twoway_model == "yes (two-way model)",
    "twoway",
    "oneway"
  )

  unit <- ifelse(
    input$single_or_average == "single rating",
    "single",
    "average"
  )

  type <- ifelse(
    input$agreement_or_consistency == "absolute agreement",
    "agreement",
    "consistency"
  )

  # Fall back to the lme4 engine when any ratings are missing.
  missing_data <- anyNA(getData())
  engine <- ifelse(missing_data, "lme4", "irr")

  add_formatted_results_to_icc(
    run_icc(
      getData(),
      model = model,
      unit = unit,
      type = type,
      engine = engine
    )
  )
}

lme4_span <- HTML(
  "<span class=\"citation\">Bates, Mächler, Bolker, & Walker, 2015</span>"
)
irr_span <- HTML(
  "<span class=\"citation\">Gamer, Lemon, Fellows, & Singh, 2019</span>"
)

i <- reactive(list(
  subjects = runICC()[["subjects"]],
  n_trials = runICC()[["raters"]],
  unit_p = runICC()[["unit_p"]],
  unit_p2 = runICC()[["unit_p2"]],
  type_p = runICC()[["type_p"]],
  model_p = runICC()[["model_p"]],
  icc.name = runICC()[["icc.name"]],
  value = runICC()[["value"]],
  lbound_p = runICC()[["lbound_p"]],
  ubound_p = runICC()[["ubound_p"]],
  raters_p = runICC()[["raters_p"]],
  rater_participant_counts_p = runICC()[["rater_participant_counts_p"]],
  engine = runICC()[["engine"]],
  n_ratings = runICC()[["n_ratings"]],
  n_ratings_missing = runICC()[["n_ratings_missing"]],
  min_ratings_per_participant = runICC()[["min_ratings_per_participant"]],
  max_ratings_per_participant = runICC()[["max_ratings_per_participant"]],
  min_participants_per_rater = runICC()[["min_participants_per_rater"]],
  max_participants_per_rater = runICC()[["max_participants_per_rater"]],
  citation = if (runICC()[["engine"]] == "lme4") {
    glue::glue('the R packages lme4 (vers. {packageVersion("lme4")}; {lme4_span}) and irr (vers. {packageVersion("irr")}; {irr_span})')
  } else {
    glue::glue('the irr R package (vers. {packageVersion("irr")}; {irr_span})')
  }
))
```
Upload a CSV file of scores with one column per rater and no other columns.
```{r}
fileInput(
  "file1",
  "Choose CSV File",
  accept = c(
    "text/csv",
    "text/comma-separated-values,text/plain",
    ".csv"
  )
)

selectInput(
  "use_twoway_model",
  label = "Do raters evaluate more than one participant?",
  choices = c("yes (two-way model)", "no (one-way model)"),
  multiple = FALSE
)

selectInput(
  "single_or_average",
  label = "Do you want the reliability for a single rating or the reliability for the average rating?",
  choices = c("single rating", "average rating"),
  multiple = FALSE
)

renderUI({
  req(input$use_twoway_model)

  # Consistency is only defined for two-way models.
  choices <- if (input$use_twoway_model == "no (one-way model)") {
    c("absolute agreement")
  } else {
    c("absolute agreement", "consistency")
  }

  label <- if (input$use_twoway_model == "no (one-way model)") {
    HTML("<s>Do you want to estimate agreement of raters or consistency of raters?</s> Consistency is not available for one-way models.")
  } else {
    "Do you want to estimate agreement of raters or consistency of raters?"
  }

  selectInput(
    "agreement_or_consistency",
    label = label,
    choices = choices,
    multiple = FALSE
  )
})
```
Developed by TJ Mahr for The WISC Lab.
We calculated the interrater reliability of [instrument name] with the
intraclass correlation coefficient (ICC) estimated using
`r renderUI(HTML(i()[["citation"]]))`.

`r renderUI(HTML(i()[["rater_participant_counts_p"]]))`

We used `r renderText(i()[["unit_p"]])`, `r renderText(i()[["type_p"]])`,
`r renderText(i()[["model_p"]])` random effects model, and we found
[interpret the correlation] agreement among `r renderText(i()[["unit_p2"]])`,
`r renderText(i()[["icc.name"]])` = `r renderText(i()[["value"]])`,
95% CI = [`r renderText(i()[["lbound_p"]])`, `r renderText(i()[["ubound_p"]])`].
```{r}
renderUI(
  if (i()[["engine"]] == "lme4") {
    HTML(
      '<div id="section-refs">
        <div id="section-ref-lme4">
          <p>Bates, D., Mächler, M., Bolker, B., & Walker, S. (2015).
          Fitting linear mixed-effects models using lme4.
          <em>Journal of Statistical Software</em>, <em>67</em>(1), 1–48.
          <a href="https://doi.org/10.18637/jss.v067.i01">https://doi.org/10.18637/jss.v067.i01</a></p>
        </div>
        <div id="section-ref-R-irr">
          <p>Gamer, M., Lemon, J., Fellows, I., & Singh, P. (2019).
          <em>irr: Various coefficients of interrater reliability and agreement</em>.
          Retrieved from
          <a href="https://CRAN.R-project.org/package=irr">https://CRAN.R-project.org/package=irr</a></p>
        </div>
      </div>'
    )
  } else {
    HTML(
      '<div id="section-refs">
        <div id="section-ref-R-irr">
          <p>Gamer, M., Lemon, J., Fellows, I., & Singh, P. (2019).
          <em>irr: Various coefficients of interrater reliability and agreement</em>.
          Retrieved from
          <a href="https://CRAN.R-project.org/package=irr">https://CRAN.R-project.org/package=irr</a></p>
        </div>
      </div>'
    )
  }
)
```
```{r}
renderPrint(runICC())

renderPrint(cat(
  "ICCBot details", "\n",
  "iccbot version: ", format(utils::packageVersion("iccbot")), "\n",
  "Variance computed by package: ", i()[["engine"]], "\n",
  "N ratings: ", i()[["n_ratings"]], "\n",
  "N missing ratings: ", i()[["n_ratings_missing"]],
  sep = ""
))
```
```{r}
renderPrint(head(getData()))
```
Column {data-width=600}
-----------------------------------------------------------------------

### Measurement {.no-title}
The ICC is the intraclass correlation coefficient. It provides a way to measure the correlation of data within measurement units (classes). For interrater reliability checks, it estimates how similar raters’ scores are within each measurement unit. See the interpretation section below for a longer description of the ICC.
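Outside of this app, the same calculation can be run directly with `irr::icc()`; a minimal sketch using the Shrout and Fleiss (1979) example data that the app loads by default (6 participants, 4 judges):

```r
library(irr)

# Shrout & Fleiss (1979) example: rows are participants, columns are judges.
ratings <- data.frame(
  judge1 = c(9, 6, 8, 7, 10, 6),
  judge2 = c(2, 1, 4, 1, 5, 2),
  judge3 = c(5, 3, 6, 2, 6, 4),
  judge4 = c(8, 2, 8, 6, 9, 7)
)

# Two-way model, absolute agreement, single rating: ICC(A,1).
result <- icc(ratings, model = "twoway", type = "agreement", unit = "single")
round(result$value, 2)
# ≈ 0.29 for these data
```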
### no (one-way model)

### yes (two-way model)

In the one-way model, the data are repeated or grouped in just one way (participants who have multiple ratings). Hallgren (2012) elaborates:
In the two-way case, where some raters evaluated multiple participants,
there is another decision one can make: whether the estimated
reliability should generalize to new raters (two-way random) or raters
should be treated as fixed (two-way mixed). This app does not support
the latter option, but it is supported by the `psych::ICC()` function.
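As a hedged sketch of that alternative route, `psych::ICC()` reports both estimates side by side on the same data. The ratings below are the Shrout and Fleiss (1979) example that this app loads by default (6 participants rated by 4 judges); in psych's labels, ICC2 is the two-way random estimate (generalizes to new raters) and ICC3 is the two-way mixed estimate (raters treated as fixed).

```r
library(psych)

# Shrout & Fleiss (1979) example: rows are participants, columns are judges.
ratings <- data.frame(
  judge1 = c(9, 6, 8, 7, 10, 6),
  judge2 = c(2, 1, 4, 1, 5, 2),
  judge3 = c(5, 3, 6, 2, 6, 4),
  judge4 = c(8, 2, 8, 6, 9, 7)
)

res <- psych::ICC(ratings, lmer = FALSE)

# ICC2: single rating, two-way random; ICC3: single rating, two-way mixed.
res$results[res$results$type %in% c("ICC2", "ICC3"), c("type", "ICC")]
# ICC2 ≈ .29, ICC3 ≈ .71 for these data
```

Treating raters as fixed removes the rater variance from the denominator, which is why ICC3 is larger than ICC2 here.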
In general, if a new person—a new student, clinician, etc.—could be trained to perform the rating task, the two-way random is a reasonable default. See how Koo and Li (2016) emphasize generalizability for two-way random models:
### single rating

### average rating

Yoder and Symons (2010) spell out the difference:
Note that changing from single-rating to average-rating reliability means that we cannot talk about single scores being reliable—we only know about the reliability of the averages. Shrout and Fleiss (1979), in their landmark survey of ICC types, illustrate this point:
### absolute agreement

### consistency

With "no (one-way model)", only "absolute agreement" is available.

Suppose you have two teachers, and one of them has a reputation for being a hard grader. They both grade the same five students. The hard grader gives scores of 60%, 68%, 78%, 80%, 85%, and the other teacher gives scores of 80%, 88%, 98%, 100%, 100%. They give the students the same rankings, but they differ in their average score. Because the teachers each rate more than one student, this is a two-way model, and because in most contexts each student receives only one grade on an assignment, we want to know single-score reliability. The consistency-based score is ICC(C,1) = .97, so the teachers are almost perfectly reliable at ranking the students. The absolute-agreement-based score is ICC(A,1) = .32, so it is very difficult to compare ratings between the teachers. To interpret a score, you would need to know who the teacher was, or you would want the teachers to grade on a curve (i.e., renormalize the scores to remove each teacher's average).
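The teacher example can be reproduced with `irr::icc()`; a sketch using the grades from the paragraph above (the column names are made up for illustration):

```r
library(irr)

# The two teachers' grades for the same five students.
grades <- data.frame(
  hard_grader = c(60, 68, 78, 80, 85),
  easy_grader = c(80, 88, 98, 100, 100)
)

# Consistency: do the teachers rank the students the same way?
icc_c <- icc(grades, model = "twoway", type = "consistency", unit = "single")

# Absolute agreement: are the actual scores interchangeable?
icc_a <- icc(grades, model = "twoway", type = "agreement", unit = "single")

round(c(consistency = icc_c$value, agreement = icc_a$value), 2)
# consistency ≈ .97, agreement ≈ .32
```

The near-constant 20-point gap between the teachers barely hurts consistency but dominates the absolute-agreement denominator.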
In the one-way model, where every rating is done by a unique rater, there is no way to assess whether one rater is consistently a harder scorer than another. Ratings can only differ from each other in absolute terms. Therefore, only absolute agreement is available for one-way models.
Intelligibility ratings. We have naive listeners transcribe recordings
of children. Each child is transcribed by two unique listeners; each
listener only ever hears one child. We combine these transcriptions into
a single score. This situation requires a
one-way, agreement-based, average rating
ICC.
Language sample coding. We have two students in the lab transcribe
interactions between a parent and child. We want to know whether the
word counts or utterance counts are similar between transcribers. As a
reliability check, both students transcribe the same subset of data, but
the eventual analysis on the larger data will use just one transcription
per child. This situation requires a
two-way, agreement-based, single rating
ICC.
Generally speaking, for an interrater reliability “check”
situation—where multiple raters score a subset of the overall data but
most of the data was scored by just one rater—use single rating
.
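The two scenarios above map onto `irr::icc()` arguments as follows; a sketch with made-up scores, using the one-column-per-rater layout described earlier:

```r
library(irr)

# Hypothetical scores: one row per child, one column per rater slot.
scores <- data.frame(
  rater1 = c(71, 80, 62, 90, 75),
  rater2 = c(73, 78, 66, 88, 74)
)

# Intelligibility ratings: unique listeners per child, averaged scores.
icc_oneway_avg <- icc(scores, model = "oneway", type = "agreement", unit = "average")

# Language sample coding: same transcribers, one transcription used later.
icc_twoway_single <- icc(scores, model = "twoway", type = "agreement", unit = "single")

c(icc_oneway_avg$value, icc_twoway_single$value)
```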
The ICC is the intraclass correlation coefficient. It provides a way to measure the correlation of data within measurement units (classes). For example, suppose we give the same assessment to the same 10 children on three occasions. In general, we would want the scores to be correlated within each child, so that a child attains a similar score on each occasion. The ICC estimates this correlation.
Now, let’s change the example from children tested three times to children who visit a research lab once but have their data scored by three different raters (or judges or coders). Then the ICC would measure how similar the scores are within children. If scores are very similar within children, then the differences between judges are small and the judges have high agreement with each other. This is how ICC works as a measure of interrater reliability.
The ICC shows up frequently in the literature on multilevel or repeated measurement data. Think of children nested in different classrooms or experimental trials nested in a participant. I mention this context because that’s the frame of reference for the texts I quote from.
Snijders and Bosker (1999), thinking about individuals nested in groups, provide two interpretations:
In general, the second interpretation is the more common one, and most definitions of ICC talk about the variation between groups versus the variation within groups. So, when Snijders and Bosker say “fraction of total variability,” they have a fraction like the following in mind:
$$
\frac{\text{between-group variation}}{\text{total variation}} =
\frac{\text{between-group variation}}{\text{between-group variation}
+ \text{within-group variation}}
$$
In the context of interrater reliability, the groups are the participants who are being rated. Between-group variation is how the participants differ from each other, and within-group variation is how the ratings differ within the participants (i.e., between raters).
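With ratings in long format, that fraction can be computed directly from the variance components of an intercept-only mixed model fit with lme4. A sketch with hypothetical data (the scores and names here are made up for illustration):

```r
library(lme4)

# Hypothetical long-format ratings: two ratings per participant.
d <- data.frame(
  participant = factor(rep(1:5, each = 2)),
  score = c(70, 72, 80, 79, 60, 65, 90, 88, 75, 74)
)

# Random intercept per participant separates the two variance components.
m <- lmer(score ~ 1 + (1 | participant), data = d)
vc <- as.data.frame(VarCorr(m))

between <- vc$vcov[vc$grp == "participant"]
within  <- vc$vcov[vc$grp == "Residual"]

# Fraction of total variation that is between participants.
icc_hat <- between / (between + within)
icc_hat
```

Here the ratings within each participant are close together while the participants differ a lot, so the fraction comes out near 1.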
Technically, the actual fractions to compute ICC scores are more involved than the one above, as they account for between-rater variation. Still, Yoder and Symons (2010) support the ICC-as-a-proportion interpretation for interrater reliability:
Kreft and De Leeuw (1998), thinking about individuals nested in groups, do a good job explaining what it means for between-group variation to be low compared to within-group variation; that is, what a low ICC means:
If your reliability check is showing low ICC scores, the differences between the judges' ratings are so large that they might as well be comparing their ratings for different participants.
Column {data-width=300}
-----------------------------------------------------------------------