R/iota_version_2.R
In iotarelr: Iota Inter Coder Reliability for Content Analysis

Documented in compute_iota2 get_iota2_measures get_patterns

.onUnload<-function(libpath){
  library.dynam.unload("iotarelr",libpath)
}

#'Get patterns
#'
#'Auxiliary function written in \code{R} for providing the necessary information
#'about the patterns generated by raters. This function produces the
#'input for the EM-algorithm.
#'
#'@param data \code{Matrix} or \code{data.frame} containing the ratings. The
#'cases are in the rows and the raters are in the columns. Characters in the cells
#'are supported. At least two raters are necessary.
#'@param categorical_levels \code{Vector} containing all possible categories of
#'the content analysis.
#'@return Function returns a \code{list} with the following components:
#'\item{n}{Integer representing the number of different patterns in the data.}
#'\item{shape}{\code{Matrix} containing all unique patterns in in the data.
#'Cells of the matrix are characters.}
#'\item{frq}{\code{Vector} containing the frequencies of the patterns.}
#'\item{count}{\code{Matrix} containing the relative frequencies of the
#'categories within each pattern. The number of rows equals the number of
#'patterns. The number of columns equals the number of categories.}
#'@export
get_patterns<-function(data,categorical_levels){

  n_rater<-ncol(data)
  pattern_shape<-unique(data)
  pattern_shape<-matrix(sapply(pattern_shape,as.character),ncol=n_rater)
  pattern_id<-vector(length = nrow(data))
  for(i in 1:nrow(data)){
    pattern_id[i]<-paste0(sapply(data[i,],as.character),collapse = "")
  }

  pattern_frq<-vector(length = nrow(pattern_shape))
  for(i in 1:nrow(pattern_shape)){
    tmp<-pattern_id
    tmp<-pattern_id==paste0(pattern_shape[i,],collapse = "")
    pattern_frq[i]<-sum(tmp)
  }

  internal_count_frq<-matrix(data=NA,nrow = nrow(pattern_shape),ncol=length(categorical_levels))
  colnames(internal_count_frq)<-categorical_levels
  for (k in categorical_levels){
    tmp<-pattern_shape
    tmp<-tmp==k
    internal_count_frq[,k]<-rowSums(tmp)
  }
  internal_count_frq<-internal_count_frq/n_rater

  pattern<-NULL
  pattern["n"]<-list(nrow(pattern_shape))
  pattern["shape"]<-list(pattern_shape)
  pattern["frq"]<-list(pattern_frq)
  pattern["count"]<-list(internal_count_frq)
  return(pattern)
}

#' Computes Iota and its elements in version 2
#'
#' Fits a model of Iota2 to the data
#'
#' @param data Data for which the elements should be estimated. Data must be
#' an object of type \code{data.frame} or \code{matrix} with cases in the rows and
#' raters in the columns.
#' @param random_starts An integer for the number of random starts for the
#' EM algorithm.
#' @param max_iterations An integer for the maximum number of iterations within
#' the EM algorithm.
#' @param cr_rel_change Positive numeric value for defining the convergence of the
#' EM algorithm.
#' @param con_step_size \code{Double} for specifying the size for increasing or
#' decreasing the probabilities during the conditioning stage of estimation.
#' This value should not be less than 1e-3.
#' @param con_random_starts \code{Integer} for the number of random starts
#' within the conditioning stage.
#' @param con_max_iterations \code{Integer} for the maximum number of iterations
#' during the conditioning stage.
#' @param con_rel_convergence \code{Double} for determining the convergence
#' criterion during the conditioning stage. The algorithm stops if the relative change
#' is smaller than this criterion.
#' @param fast \code{Bool} If \code{TRUE} a fast estimation is applied during the
#' condition stage. This option ignores all parameters beginning with "con_".
#' If \code{FALSE} the estimation described in Berding and
#' Pargmann (2022) is used. Default is \code{TRUE}.
#'@param trace \code{TRUE} for printing progress information on the console.
#'\code{FALSE} if this information is not to be printed.
#'@param con_trace \code{TRUE} for printing progress information on the console
#'during estimations in the conditioning stage. \code{FALSE} if this information
#'is not to be printed.
#' @param b_min Value ranging between 0 and 1, determining the minimal size of
#' the categories for checking if boundary values occurred. The algorithm tries
#' to select solutions that are not considered to be boundary values.
#' @return Returns a \code{list} with the following three components:
#' The first component \code{estimates_categorical_level} comprises all
#' elements that describe the ratings on a categorical level. The elements are
#' sub-divided into raw estimates and chance-corrected estimates.
#' \describe{
#' \item{\code{raw_estimates}}{
#' \describe{
#' \item{\code{alpha_reliability:}}{A vector containing the Alpha
#' Reliabilities for each category. These values represent probabilities.}
#' \item{\code{beta_reliability:}}{A vector containing the Beta Reliabilities for each
#' category. These values represent probabilities.}
#' \item{\code{assignment_error_matrix:}}{Assignment Error Matrix containing the conditional
#' probabilities for assigning a unit of category i to categories 1 to n.}
#'  \item{\code{iota:}}{A vector containing the Iota values for each category.}
#'  \item{\code{iota_error_1:}}{A vector containing the Iota Error Type I values for each category.}
#'  \item{\code{iota_error_2:}}{A vector containing the Iota Error Type II values for each category.}
#' }}
#'
#' \item{\code{elements_chance_corrected}}{
#' \describe{
#' \item{\code{alpha_reliability:}}{A vector containing the chance-corrected Alpha Reliabilities for each category.}
#' \item{\code{beta_reliability:}}{A vector containing the chance-corrected Beta Reliabilities for each category.}
#' }}
#'
#' The second component \code{estimates_scale_level} contains elements for
#' describing the quality of the ratings on a scale level. It comprises the
#' following elements:
#' \describe{
#' \item{\code{iota_index:}}{The Iota Index, representing the reliability on a scale level.}
#' \item{\code{iota_index_d4:}}{The Static Iota Index, which is a transformation of the original Iota Index,
#' in order to consider the uncertainty of estimation.}
#' \item{\code{iota_index_dyn2:}}{The Dynamic Iota Index, which is a transformation of the original Iota Index,
#' in order to consider the uncertainty of estimation.}
#' }
#'
#' The third component \code{information} contains important information
#' regarding the parameter estimation. It comprises the following elements:
#'\describe{
#' \item{\code{log_likelihood:}}{Log-likelihood of the best solution.}
#' \item{\code{convergence:}}{If estimation converged 0, otherwise 1.}
#' \item{\code{est_true_cat_sizes:}}{Estimated categorical sizes. This is the estimated amount of the categories.}
#' \item{\code{conformity:}}{\code{0} if the solution is in line with assumptions of weak superiority.
#'  A number greater 0 indicates the number of violations of the assumption
#'  of weak superiority.}
#'  \item{\code{random_starts:}}{Numer of random starts for the EM algorithm.}
#' \item{\code{boundaries:}}{\code{False} if the best solution does not contain boundary values.
#' \code{True} if the best solution does contain boundary values}
#' \item{\code{p_boundaries:}}{Percentage of solutions with boundary values during the estimation.}
#' \item{\code{call:}}{Name of the function that created the object.}
#' \item{\code{n_rater:}}{Number of raters.}
#' \item{\code{n_cunits:}}{Number of coding units.}
#' }}
#'@references  Florian Berding and Julia Pargmann (2022).Iota Reliability Concept
#'of the Second Generation. Measures for Content Analysis Done by
#'Humans or Artificial Intelligences. Berlin: Logos.
#'https://doi.org/10.30819/5581
#' @export
compute_iota2<-function(data,
                        random_starts=10,
                        max_iterations=5000,
                        cr_rel_change=1e-12,
                        con_step_size = 1e-4,
                        con_rel_convergence=1e-12,
                        con_max_iterations=5000,
                        con_random_starts=5,
                        b_min=.01,
                        fast=TRUE,
                        trace=TRUE,
                        con_trace=FALSE){

  #---------------------------------------------------------------------------
  #Checking input data
  if((is.data.frame(data)==TRUE |
     is.matrix(data==TRUE))==FALSE){
    stop("data must be a matrix or data.frame")
  }

  #Converting input data
  data<-as.data.frame(data)
  categorical_levels<-names(table(data[1]))
  for(i in 2:ncol(data)){
    tmp<-names(table(data[i]))
    for(j in tmp)
      if(j %in% categorical_levels==FALSE){
        categorical_levels<-c(categorical_levels,j)
      }
  }
  categorical_levels<-sort(categorical_levels,decreasing = FALSE)

  for (i in 1:ncol(data)){
    data[[i]]<-factor(x=lapply(data[[i]],as.character),levels=categorical_levels)
  }

  n_rater=ncol(data)
  n_categories<-length(categorical_levels)
  N=nrow(data)

  #-------------------------------------------------------------------------
  #Identifying patterns
  patterns<-get_patterns(data,categorical_levels)

  #-------------------------------------------------------------------------
  #Performing parameter estimations
  cons_matrix<-matrix(data=FALSE,nrow=n_categories,ncol=n_categories)
  diag(cons_matrix)<-TRUE
  estimates_list<-EM_algo_c(obs_pattern_shape=patterns$shape,
                            obs_pattern_frq=patterns$frq,
                            obs_internal_count=patterns$count,
                            categorical_levels=categorical_levels,
                            random_starts=random_starts,
                            max_iterations=max_iterations,
                            rel_convergence=cr_rel_change,
                            con_step_size = con_step_size,
                            con_random_starts=con_random_starts,
                            con_max_iterations = con_max_iterations,
                            con_rel_convergence = con_rel_convergence,
                            fast=fast,
                            trace=trace,
                            con_trace = con_trace)
  #-------------------------------------------------------------------------
  #Summarizing estimates
  summary<-matrix(data=NA,nrow=random_starts,ncol=6)
  for(i in 1:random_starts){
    summary[i,1]<-estimates_list[[i]]$log_likelihood
    summary[i,2]<-estimates_list[[i]]$convergence
    summary[i,3]<-i
  }

  #Check for correct solutions
  estimates_list_tmp<-estimates_list
  compare_sum<-0
  for(i in 1:length(estimates_list_tmp)){
    estimates_list_tmp[[i]]["con_violations"]<-check_conformity_c(estimates_list_tmp[[i]]$aem)
    summary[i,4]<-estimates_list_tmp[[i]]$con_violations
    summary[i,5]<-max(estimates_list_tmp[[i]]$categorial_sizes)
    summary[i,6]<-min(estimates_list_tmp[[i]]$categorial_sizes)
  }

  #Selection of the best estimates
  #Reducing the estimates to parameters that are as compliant as possible
  summary_selected_I<-subset(summary,summary[,6]>b_min)
  if(nrow(summary_selected_I)!=0){
  index_min<-summary_selected_I[match(x=min(summary_selected_I[,1],na.rm = TRUE),
                                      table=summary_selected_I[,1]),3]
  boundaries=FALSE
  } else {
    index_min<-summary[match(x=min(summary[,1],na.rm = TRUE),
                                        table=summary[,1]),3]
  boundaries=TRUE
  }
  estimates<-estimates_list_tmp[[index_min]]
  p_boundaries<-(length(estimates_list_tmp)-nrow(summary_selected_I))/length(estimates_list_tmp)

  #---------------------------------------------------------------------------
  #Assigning the best solution

  #Assignment-Error-Matrix
  aem=estimates$aem
  colnames(aem)=categorical_levels
  rownames(aem)=categorical_levels

  #Class Sizes
  #p_classes<-estimates$categorial_sizes
  #names(p_classes)<-categorical_levels

  #-------------------------------------------------------------------------
  #Summary
  #Request Reliability Estimates
  results<-NULL
  results<-get_iota2_measures(aem = aem,
                              categorical_sizes = estimates$categorial_sizes,
                              categorical_levels = categorical_levels
  )
  #Summarizing central information
  Esimtates_Information<-NULL
  Esimtates_Information["log_likelihood"]<-list(estimates$log_likelihood)
  Esimtates_Information["random_starts"]<-list(random_starts)
  Esimtates_Information["iteration"]<-list(estimates$iteration)
  Esimtates_Information["convergence"]<-list(estimates$convergence)
  Esimtates_Information["est_true_cat_sizes"]<-list(estimates$categorial_sizes)
  Esimtates_Information["conformity"]<-list(estimates$con_violations)
  Esimtates_Information["boundaries"]<-list(boundaries)
  Esimtates_Information["p_boundaries"]<-list(p_boundaries)
  Esimtates_Information["call"]<-list("compute_iota2")
  Esimtates_Information["n_rater"]<-list(ncol(data))
  Esimtates_Information["n_cunits"]<-list(N)
  Esimtates_Information["estimates_list"]<-list(estimates_list_tmp)
  results["information"]<-list(Esimtates_Information)

  #Assigning class
  class(results)<-"iotarelr_iota2"
  return(results)
}

#' Get Iota2 Measures
#'
#' Function for calculating the elements of the Iota Concept 2
#'
#' @param aem Assignment Error Matrix.
#' @param categorical_sizes Probabilities for the different categories to occur.
#' @param categorical_levels \code{Vector} containing all possible categories of
#' the content analysis.
#' @return Returns a \code{list} of all measures belonging to the Iota Concept
#' of the second generation.
#' The first component \code{estimates_categorical_level} comprises all
#' elements that describe the ratings on a categorical level. The elements are
#' sub-divided into raw estimates and chance-corrected estimates.
#' \describe{
#' \item{\code{raw_estimates}}{
#' \describe{
#' \item{\code{iota:}}{A vector containing the Iota values for each category.}
#'  \item{\code{iota_error_1:}}{A vector containing the Iota Error Type I values for each category.}
#'  \item{\code{iota_error_2:}}{A vector containing the Iota Error Type II values for each category.}
#' \item{\code{alpha_reliability:}}{A vector containing the Alpha
#' Reliabilities for each category. These values represent probabilities.}
#' \item{\code{beta_reliability:}}{A vector containing the Beta Reliabilities for each
#' category. These values represent probabilities.}
#' \item{\code{assignment_error_matrix:}}{Assignment Error Matrix containing the conditional
#' probabilities for assigning a unit of category i to categories 1 to n.}
#' }}
#' \item{\code{elements_chance_corrected}}{
#' \describe{
#' \item{\code{alpha_reliability:}}{A vector containing the chance-corrected Alpha Reliabilities for each category.}
#' \item{\code{beta_reliability:}}{A vector containing the chance-corrected Beta Reliabilities for each category.}
#' }}
#' }
#' The second component \code{estimates_scale_level} contains elements for
#' describing the quality of the ratings on a scale level. It comprises the
#' following elements:
#' \describe{
#' \item{\code{iota_index:}}{The Iota Index, representing the reliability on a scale level.}
#' \item{\code{iota_index_d4:}}{The Static Iota Index, which is a transformation of the original Iota Index,
#' in order to consider the uncertainty of estimation.}
#' \item{\code{iota_index_dyn2:}}{The Dynamic Iota Index, which is a transformation of the original Iota Index,
#' in order to consider the uncertainty of estimation.}
#' }
 #'@references  Florian Berding and Julia Pargmann (2022).Iota Reliability Concept
#'of the Second Generation. Measures for Content Analysis Done by
#'Humans or Artificial Intelligences. Berlin: Logos.
#'https://doi.org/10.30819/5581
#' @export
get_iota2_measures<-function(aem,
                             categorical_sizes,
                             categorical_levels){
  n_categories=length(categorical_levels)
  categorical_levels<-as.character(categorical_levels)

  #alpha-reliability
  p_alpha_reliability<-as.vector(t(as.matrix(diag(aem))))
  names(p_alpha_reliability)<-categorical_levels
  p_alpha_error<-1-p_alpha_reliability

  #Estimation of beta-error
  p_beta_error<-vector(length = n_categories)
  p_beta_error[]<-1
  names(p_beta_error)<-categorical_levels

  for (k in categorical_levels){
    other_k<-subset(categorical_levels,categorical_levels!=k)
    p_beta_error_condition<-0
    p_beta_error_target<-0
    for(k2 in other_k){
      p_beta_error_condition<-p_beta_error_condition+categorical_sizes[k2]*(1-aem[k2,k2])
      p_beta_error_target<-p_beta_error_target+categorical_sizes[k2]*aem[k2,k]
    }
    if(p_beta_error_condition>0){
      p_beta_error[k]<-p_beta_error_target/p_beta_error_condition
    }
    if(p_beta_error_condition<=0){
      p_beta_error[k]<-0
    }
  }
  p_beta_reliability<-1-p_beta_error

  #Estimating normalized and chance-corrected alpha values for Iota
  alpha<-(p_alpha_reliability-1/n_categories)/(1-1/n_categories)

  #Estimating normalized and chance-corrected beta values for Iota
  beta<-vector(length = n_categories)
  names(beta)<-categorical_levels
  for (k in categorical_levels){
    other_k<-subset(categorical_levels,categorical_levels!=k)
    p_beta_error_condition_Chance<-0
    p_beta_error_target_chance<-0
    for(k2 in other_k){
      p_beta_error_condition_Chance<-p_beta_error_condition_Chance+categorical_sizes[k2]*(1-1/n_categories)
      p_beta_error_target_chance<-p_beta_error_target_chance+categorical_sizes[k2]*1/n_categories
    }
    tmp<-1-p_beta_error_target_chance/p_beta_error_condition_Chance
    beta[k]<-(p_beta_reliability[k]-tmp)/(1-tmp)
    #beta[k]<-abs((p_beta_reliability[k]-tmp)/(1-tmp))
  }

  #Estimating Iota
  beta_error_sum<-vector(length = n_categories)
  for(i in 1:n_categories){
    beta_error_sum[i]<-sum(categorical_sizes*p_alpha_error)-
      categorical_sizes[i]*p_alpha_error[i]
  }
  iota_numerator<-categorical_sizes*p_alpha_reliability
  iota_denominator<-categorical_sizes*p_alpha_reliability+
    categorical_sizes*p_alpha_error+
    p_beta_error*beta_error_sum

  iota<-iota_numerator/iota_denominator
  names(iota)<-categorical_levels

  iota_e1_numerator<-categorical_sizes*(1-p_alpha_reliability)
  iota_e1<-iota_e1_numerator/iota_denominator
  names(iota_e1)<-categorical_levels

  iota_e2<-1-iota-iota_e1
  names(iota_e2)<-categorical_levels

  #Estimation of Iota Index
  iota_index<-categorical_sizes%*%rowSums(abs(
    aem-matrix(data=1/n_categories,
               nrow = n_categories,
               ncol=n_categories)))/(2*(n_categories-1)/n_categories)

  #Estimation of Iota Index dyn=2
  iota_index_dyn2<-iota_index^(1+iota_index^2)

  #Estimation of Iota Index d=4
  d=4
  distances<-matrix(ncol=n_categories,
                    nrow=n_categories,
                    data=NA)
  for(i in 1:n_categories){
    for(j in 1:n_categories){
      distances[i,j]<-abs(aem[i,j]-1/n_categories)^d
    }
  }
  iota_index_d4<-categorical_sizes%*%rowSums(distances)
  max_distance<-1/((1-1/n_categories)^d+(n_categories-1)*(abs(1/n_categories)^d))
  iota_index_d4<-max_distance*iota_index_d4

  #Summary of all Estimates
  Elements_Chance_Corrected<-NULL
  Elements_Chance_Corrected["alpha_reliability"]<-list(alpha)
  Elements_Chance_Corrected["beta_reliability"]<-list(beta)

  Elements_Raw_Estimates<-NULL
  Elements_Raw_Estimates["iota"]<-list(iota)
  Elements_Raw_Estimates["iota_error_1"]<-list(iota_e1)
  Elements_Raw_Estimates["iota_error_2"]<-list(iota_e2)
  Elements_Raw_Estimates["alpha_reliability"]<-list(p_alpha_reliability)
  Elements_Raw_Estimates["beta_reliability"]<-list(p_beta_reliability)
  Elements_Raw_Estimates["assignment_error_matrix"]<-list(aem)

  Estimates_Categorical_Level<-NULL
  Estimates_Categorical_Level["raw_estimates"]<-list(Elements_Raw_Estimates)
  Estimates_Categorical_Level["chance_corrected"]<-list(Elements_Chance_Corrected)

  Estimates_Scale_Level<-NULL
  Estimates_Scale_Level["iota_index"]<-list(iota_index)
  Estimates_Scale_Level["iota_index_d4"]<-list(iota_index_d4)
  Estimates_Scale_Level["iota_index_dyn2"]<-list(iota_index_dyn2)

  results<-NULL
  results["categorical_level"]<-list(Estimates_Categorical_Level)
  results["scale_level"]<-list(Estimates_Scale_Level)

  return(results)
}