R/02_TestItemFunctions.R

Defines functions Dimensionality.ordinal Dimensionality.rated Dimensionality.binary Dimensionality.default Dimensionality stanine.binary stanine.default stanine percentile.binary percentile.default percentile sscore.binary sscore.default sscore passage.binary passage.default passage nrs.binary nrs.default nrs ITBiserial.binary ITBiserial.default ITBiserial BiserialCorrelation ItemTotalCorr.ordinal ItemTotalCorr.binary ItemTotalCorr.default ItemTotalCorr ItemEntropy.ordinal ItemEntropy.binary ItemEntropy.default ItemEntropy ItemThreshold.ordinal ItemThreshold.binary ItemThreshold.default ItemThreshold ItemOdds.binary ItemOdds.default ItemOdds crr.binary crr.default crr TetrachoricCorrelationMatrix.binary TetrachoricCorrelationMatrix.default TetrachoricCorrelationMatrix PolychoricCorrelationMatrix.ordinal PolychoricCorrelationMatrix.default PolychoricCorrelationMatrix polyserial polychoric tetrachoric PhiCoefficient.binary PhiCoefficient.default PhiCoefficient MutualInformation.ordinal MutualInformation.binary MutualInformation.default MutualInformation ItemLift.binary ItemLift.default ItemLift CCRR.nominal CCRR.binary CCRR.default CCRR CSR JSR JCRR.nominal JCRR.binary JCRR.default JCRR JointSampleSize.binary JointSampleSize.default JointSampleSize

Documented in BiserialCorrelation CCRR CCRR.binary CCRR.default CCRR.nominal crr crr.binary crr.default CSR Dimensionality Dimensionality.binary Dimensionality.default Dimensionality.ordinal Dimensionality.rated ITBiserial ITBiserial.binary ITBiserial.default ItemEntropy ItemEntropy.binary ItemEntropy.default ItemEntropy.ordinal ItemLift ItemLift.binary ItemLift.default ItemOdds ItemOdds.binary ItemOdds.default ItemThreshold ItemThreshold.binary ItemThreshold.ordinal ItemTotalCorr ItemTotalCorr.binary ItemTotalCorr.default ItemTotalCorr.ordinal JCRR JCRR.binary JCRR.default JCRR.nominal JointSampleSize JointSampleSize.binary JointSampleSize.default JSR MutualInformation MutualInformation.binary MutualInformation.default MutualInformation.ordinal nrs nrs.binary nrs.default passage passage.binary passage.default percentile percentile.binary percentile.default PhiCoefficient PhiCoefficient.binary PhiCoefficient.default polychoric PolychoricCorrelationMatrix PolychoricCorrelationMatrix.default PolychoricCorrelationMatrix.ordinal polyserial sscore sscore.binary sscore.default stanine stanine.binary stanine.default tetrachoric TetrachoricCorrelationMatrix TetrachoricCorrelationMatrix.binary TetrachoricCorrelationMatrix.default

#' @title Joint Sample Size
#' @description
#' The joint sample size is a matrix whose elements are the number of
#' individuals who responded to each pair of items.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return Returns a matrix of class c("exametrika", "matrix") where each element (i,j)
#'   represents the number of students who responded to both item i and item j. The
#'   diagonal elements represent the total number of responses for each item.
#' @export
JointSampleSize <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("JointSampleSize")
}
#' @rdname JointSampleSize
#' @export
JointSampleSize.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  JointSampleSize.binary(U, na, Z, w)
}

#' @rdname JointSampleSize
#' @export
JointSampleSize.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  S_jk <- t(U$Z) %*% U$Z
  structure(S_jk, class = c("exametrika", "matrix"))
}

#' @title Joint Correct Response Rate
#' @description
#' The joint correct response rate (JCRR) is the rate of students who passed
#' both items. This function is applicable only to binary response data.
#' For non-binary data, it will automatically redirect to the JSR function
#' with an appropriate message.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A matrix of joint correct response rates with exametrika class.
#' Each element (i,j) represents the proportion of students who correctly
#' answered both items i and j.
#' @examples
#' # example code
#' # Calculate JCRR using sample dataset J5S10
#' JCRR(J5S10)
#' @export
JCRR <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("JCRR")
}

#' @rdname JCRR
#' @export
JCRR.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (inherits(U, "exametrika")) {
    switch(U$response.type,
      "binary" = JCRR.binary(U, na = NULL, Z = NULL, w = NULL),
      "rated" = JCRR.nominal(U, na = NULL, Z = NULL, w = NULL),
      "ordinal" = JCRR.nominal(U, na = NULL, Z = NULL, w = NULL),
      "nominal" = JCRR.nominal(U, na = NULL, Z = NULL, w = NULL)
    )
  } else {
    U <- dataFormat(U, na = na, Z = Z, w = w)
    JCRR(U)
  }
}

#' @rdname JCRR
#' @export
JCRR.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  P_J <- t(U$Z * U$U) %*% (U$Z * U$U) / (t(U$Z) %*% U$Z)
  structure(P_J, class = c("exametrika", "matrix"))
}

#' @rdname JCRR
#' @export
JCRR.nominal <- function(U, na = NULL, Z = NULL, w = NULL) {
  message("JCRR is for binary data only. Using Joint Selection Rate for your polytomous data instead.")
  JSR(U)
}

#' @title Joint Selection Rate
#' @description Calculate the Joint Selection Rate (JSR) for polytomous data.
#'   JSR measures the proportion of respondents who selected specific category
#'   combinations between pairs of items. For each pair of items (j,k),
#'   it returns a contingency table showing the joint probability of selecting
#'   each category combination.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @return A list of Joint Selection Rate matrices for each item pair.
#' @examples
#' # example code
#' # Calculate JCRR using sample dataset J5S1000
#' JSR(J5S1000)
#' @export
JSR <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type == "binary") {
    message("JSR is for non-binary data only. Using Joint Correct Response Rate for your binary data instead.")
    JCRR(U)
  }
  nitems <- NCOL(U$Q)
  ncat <- U$categories
  U$Q[U$Z == 0] <- NA
  JSR <- vector("list", nitems)
  for (j in 1:nitems) {
    JSR[[j]] <- vector("list", nitems)
    for (k in 1:nitems) {
      tbl <- table(U$Q[, j], U$Q[, k], useNA = "no") / sum(table(U$Q[, j], U$Q[, k], useNA = "no"))
      colnames(tbl) <- unlist(U$CategoryLabel[k])
      rownames(tbl) <- unlist(U$CategoryLabel[j])
      JSR[[j]][[k]] <- tbl
    }
  }
  return(JSR)
}

#' @title Conditional Selection Rate
#' @description Calculate the Conditional Selection Rate (CSR) for polytomous data.
#'   CSR measures the proportion of respondents who selected a specific category
#'   in item K, given that they selected a particular category in item J.
#' @details
#'   The function returns a nested list structure CSR, where \code{CSR[[j]][[k]]} contains
#'   a matrix of conditional probabilities. In this matrix, the element at row l and
#'   column m represents P(K=m|J=l), which is the probability of selecting category m
#'   for item K, given that category l was selected for item J.
#'
#'   Mathematically, for each cell (l,m) in the \code{CSR[[j]][[k]]} matrix:
#'   \code{CSR[[j]][[k]][l,m] = P(Item K = category m | Item J = category l)}
#'
#'   This is calculated as the number of respondents who selected both category l for
#'   item J and category m for item K, divided by the total number of respondents who
#'   selected category l for item J.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @return A list of Joint Selection Rate matrices for each item pair.
#' @examples
#' # example code
#' # Calculate CSR using sample dataset J5S1000
#' CSR(J5S1000)
#'
#' # Extract the conditional selection rates from item 1 to item 2
#' csr_1_2 <- CSR(J5S1000)[[1]][[2]]
#' # This shows the probability of selecting each category in item 2
#' # given that a specific category was selected in item 1
#' @export
CSR <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type == "binary") {
    message("CSR is for non-binary data only. Using Conditional Correct Response Rate for your binary data instead.")
    CCRR(U)
  }
  nitems <- NCOL(U$Q)
  ncat <- U$categories
  U$Q[U$Z == 0] <- NA
  CSR <- vector("list", nitems)
  for (j in 1:nitems) {
    CSR[[j]] <- vector("list", nitems)
    for (k in 1:nitems) {
      tbl <- table(U$Q[, j], U$Q[, k], useNA = "no")
      ccsr <- matrix(NA, nrow = ncat[j], ncol = ncat[k])
      for (l in 1:ncat[j]) {
        ccsr[l, ] <- tbl[l, ] / rowSums(tbl)[l]
      }
      rownames(ccsr) <- U$CategoryLabel[[j]]
      colnames(ccsr) <- U$CategoryLabel[[k]]
      CSR[[j]][[k]] <- ccsr
    }
  }
  return(CSR)
}

#' @title Conditional Correct Response Rate
#' @description
#' The conditional correct response rate (CCRR) represents the ratio of the students
#' who passed Item C (consequent item) to those who passed Item A (antecedent item).
#' This function is applicable only to binary response data.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A matrix of conditional correct response rates with exametrika class.
#' Each element (i,j) represents the probability of correctly answering item j
#' given that item i was answered correctly.
#' @examples
#' # example code
#' # Calculate CCRR using sample dataset J5S10
#' CCRR(J5S10)
#' @export

CCRR <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("CCRR")
}

#' @rdname CCRR
#' @export
CCRR.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (inherits(U, "exametrika")) {
    switch(U$response.type,
      "binary" = CCRR.binary(U, na = NULL, Z = NULL, w = NULL),
      "rated" = CCRR.nominal(U, na = NULL, Z = NULL, w = NULL),
      "ordinal" = CCRR.nominal(U, na = NULL, Z = NULL, w = NULL),
      "nominal" = CCRR.nominal(U, na = NULL, Z = NULL, w = NULL)
    )
  } else {
    U <- dataFormat(U, na = na, Z = Z, w = w)
    JCRR(U)
  }
}

#' @rdname CCRR
#' @export
CCRR.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  Z <- U$Z
  OneJ <- rep(1, ncol(U$U))
  Pj <- JCRR(U)
  p <- crr(U)
  P_C <- Pj / (p %*% t(OneJ))
  structure(P_C, class = c("exametrika", "matrix"))
}

#' @rdname CCRR
#' @export
CCRR.nominal <- function(U, na = NULL, Z = NULL, w = NULL) {
  message("CCRR is for binary data only. Conditional Selection Rate for your polytomous data instead.")
  CSR(U)
}
#' @title Item Lift
#' @description
#' The lift is a commonly used index in a POS data analysis.
#' The item lift of Item k to Item j is defined as follow:
#' \eqn{ l_{jk} = \frac{p_{k\mid j}}{p_k} }
#' This function is applicable only to binary response data.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A matrix of item lift values with exametrika class.
#' Each element (j,k) represents the lift value of item k given item j,
#' which indicates how much more likely item k is to be correct given that
#' item j was answered correctly.
#' @references Brin, S., Motwani, R., Ullman, J., & Tsur, S. (1997). Dynamic itemset counting and
#' implication rules for market basket data. In Proceedings of ACM SIGMOD International Conference
#' on Management of Data (pp. 255–264). https://dl.acm.org/doi/10.1145/253262.253325
#' @examples
#' # example code
#' # Calculate ItemLift using sample dataset J5S10
#' ItemLift(J5S10)
#' @export
ItemLift <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("ItemLift")
}

#' @rdname ItemLift
#' @export
ItemLift.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "ItemLift")
  }
  ItemLift.binary(U, na, Z, w)
}

#' @rdname ItemLift
#' @export
ItemLift.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  OneJ <- rep(1, ncol(U$U))
  Pc <- CCRR(U)
  p <- crr(U)
  P_L <- Pc / (OneJ %*% t(p))
  structure(P_L, class = c("exametrika", "matrix"))
}

#' @title Mutual Information
#' @description
#' Mutual Information is a measure that represents the degree of interdependence
#' between two items. This function is applicable to both binary and polytomous response data.
#' The measure is calculated using the joint probability distribution of responses
#' between item pairs and their marginal probabilities.
#' @details
#' For binary data, the following formula is used:
#' \deqn{
#' MI_{jk} = p_{00} \log_2 \frac{p_{00}}{(1-p_j)(1-p_k)} + p_{01} \log_2 \frac{p_{01}}{(1-p_j)p_k}
#'  + p_{10} \log_2 \frac{p_{10}}{p_j(1-p_k)} + p_{11} \log_2 \frac{p_{11}}{p_jp_k}
#' }
#' Where:
#' \itemize{
#'   \item \eqn{p_{00}} is the joint probability of incorrect responses to both items j and k
#'   \item \eqn{p_{01}} is the joint probability of incorrect response to item j and correct to item k
#'   \item \eqn{p_{10}} is the joint probability of correct response to item j and incorrect to item k
#'   \item \eqn{p_{11}} is the joint probability of correct responses to both items j and k
#' }
#'
#' For polytomous data, the following formula is used:
#' \deqn{MI_{jk} = \sum_{j=1}^{C_j}\sum_{k=1}^{C_k}p_{jk}\log \frac{p_{jk}}{p_{j.}p_{.k}}}
#'
#' The base of the logarithm can be the number of rows, number of columns, min(rows, columns),
#' base-10 logarithm, natural logarithm (e), etc.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @param base The base for the logarithm. Default is 2. For polytomous data,
#'   you can use "V" to set the base to min(rows, columns), "e" for natural logarithm (base e),
#'   or any other number to use that specific base.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A matrix of mutual information values with exametrika class.
#' Each element (i,j) represents the mutual information between items i and j,
#' measured in bits. Higher values indicate stronger interdependence between items.
#' @examples
#' # example code
#' # Calculate Mutual Information using sample dataset J15S500
#' MutualInformation(J15S500)
#' @export
MutualInformation <- function(U, na = NULL, Z = NULL, w = NULL, base = 2) {
  UseMethod("MutualInformation")
}

#' @rdname MutualInformation
#' @export
MutualInformation.default <- function(U, na = NULL, Z = NULL, w = NULL, base = 2) {
  if (inherits(U, "exametrika")) {
    switch(U$response.type,
      "binary" = MutualInformation.binary(U, na, Z, w, base = base),
      "rated" = MutualInformation.ordinal(U, na, Z, w, base = base),
      "ordinal" = MutualInformation.ordinal(U, na, Z, w, base = base),
      "nominal" = MutualInformation.ordinal(U, na, Z, w, base = base)
    )
  } else {
    U <- dataFormat(U, na = na, Z = Z, w = w)
    MutualInformation(U, na, Z, w, base = base)
  }
}

#' @rdname MutualInformation
#' @export
MutualInformation.binary <- function(U, na = NULL, Z = NULL, w = NULL, base = 2) {
  p <- crr(U)
  # Calculate joint response matrix
  S <- list()
  S$S_11 <- t(U$Z * U$U) %*% (U$Z * U$U)
  S$S_10 <- t(U$Z * U$U) %*% (U$Z * (1 - U$U))
  S$S_01 <- t(U$Z * (1 - U$U)) %*% (U$Z * U$U)
  S$S_00 <- t(U$Z * (1 - U$U)) %*% (U$Z * (1 - U$U))

  # Calculate joint probability matrix
  P <- lapply(S, function(x) x / (t(U$Z) %*% U$Z))

  # Calculate lift matrix
  L <- list()
  L$L_11 <- P$S_11 / (p %*% t(p))
  L$L_10 <- P$S_10 / (p %*% t(1 - p))
  L$L_01 <- P$S_01 / ((1 - p) %*% t(p))
  L$L_00 <- P$S_00 / ((1 - p) %*% t(1 - p))

  # Calculate mutual information
  MI <- P$S_00 * log(L$L_00, base = 2) +
    P$S_01 * log(L$L_01, base = 2) +
    P$S_10 * log(L$L_10, base = 2) +
    P$S_11 * log(L$L_11, base = 2)

  # Adjust diagonal elements
  diag(MI) <- diag(P$S_00 * log(L$L_00, base = 2) +
    P$S_11 * log(L$L_11, base = 2))

  structure(MI, class = c("exametrika", "matrix"))
}

#' @rdname MutualInformation
#' @export
MutualInformation.ordinal <- function(U, na = NULL, Z = NULL, w = NULL, base = 2) {
  if (is.character(base)) {
    if (base != "V" & base != "v" & base != "e") {
      stop("The base of the logarithm must be a number, 'V', or 'e'.")
    }
  }
  nitems <- NCOL(U$Z)
  ncat <- apply(U$Q, 2, max)
  mat <- matrix(ncol = nitems, nrow = nitems)
  for (i in 1:nitems) {
    for (j in 1:nitems) {
      x <- U$Q[, i]
      y <- U$Q[, j]
      x[x == -1] <- NA
      y[y == -1] <- NA
      pairwise <- !is.na(x + y)
      x <- x[pairwise]
      y <- y[pairwise]
      tbl <- table(x, y) / sum(length(x))

      if (base == "V" | base == "v") {
        tei <- min(ncat[i], ncat[j])
      } else if (base == "e") {
        tei <- exp(1)
      } else {
        tei <- as.numeric(base)
      }
      mi <- 0
      cSum <- colSums(tbl)
      rSum <- rowSums(tbl)
      for (k in 1:NROW(tbl)) {
        for (m in 1:NCOL(tbl)) {
          mi <- mi + (tbl[k, m] * (log(tbl[k, m] / rSum[k] / cSum[m], base = tei)))
        }
      }
      mat[i, j] <- mi
    }
  }
  return(mat)
}

#' @title Phi-Coefficient
#' @description
#' The phi coefficient is the Pearson's product moment correlation coefficient
#' between two binary items. This function is applicable only to binary response data.
#' The coefficient ranges from -1 to 1, where 1 indicates perfect positive correlation,
#' -1 indicates perfect negative correlation, and 0 indicates no correlation.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A matrix of phi coefficients with exametrika class.
#' Each element (i,j) represents the phi coefficient between items i and j.
#' The matrix is symmetric with ones on the diagonal.
#' @examples
#' # example code
#' # Calculate Phi-Coefficient using sample dataset J15S500
#' PhiCoefficient(J15S500)
#' @export
PhiCoefficient <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("PhiCoefficient")
}

#' @rdname PhiCoefficient
#' @export
PhiCoefficient.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "PhiCoefficient")
  }
  PhiCoefficient.binary(U, na, Z, w)
}

#' @rdname PhiCoefficient
#' @export
PhiCoefficient.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  p <- crr(U)
  OneS <- rep(1, nrow(U$U))
  OneJ <- rep(1, ncol(U$U))

  # Calculate centered cross-product matrix
  C <- t(U$Z * (U$U - OneS %*% t(p))) %*%
    (U$Z * (U$U - OneS %*% t(p))) /
    (t(U$Z) %*% U$Z - OneJ %*% t(OneJ))

  # Calculate standard deviations
  v <- diag(C)

  # Calculate correlation matrix
  phi <- C / sqrt(v) %*% t(sqrt(v))

  structure(phi, class = c("exametrika", "matrix"))
}

#' @title Tetrachoric Correlation
#' @description
#' Tetrachoric Correlation is superior to the phi coefficient as a measure of the
#' relation of an item pair. See Divgi, 1979; Olsson, 1979;Harris, 1988.
#' @references Divgi, D. R. (1979). Calculation of the tetrachoric correlation coefficient.
#' Psychometrika, 44, 169–172.
#' @references Olsson, U. (1979). Maximum likelihood estimation of the polychoric correlation
#'  coefficient. Psychometrika,44, 443–460.
#' @references Harris, B. (1988). Tetrachoric correlation coefficient. In L. Kotz, & N. L. Johnson
#'  (Eds.), Encyclopedia of statistical sciences (Vol. 9, pp. 223–225). Wiley.
#' @param x binary vector x
#' @param y binary vector y
#' @importFrom mvtnorm pmvnorm
#' @importFrom stats qnorm
#' @importFrom stats pnorm
#' @importFrom stats optimize
#' @return Returns a single numeric value of class "exametrika" representing the
#'   tetrachoric correlation coefficient between the two binary variables. The value
#'   ranges from -1 to 1, where:
#'   \itemize{
#'     \item 1 indicates perfect positive correlation
#'     \item -1 indicates perfect negative correlation
#'     \item 0 indicates no correlation
#'   }
#' @export

tetrachoric <- function(x, y) {
  pairwise <- !is.na(x + y)
  # count 2x2 cells
  x.fac <- factor(x[pairwise], levels = 0:1)
  y.fac <- factor(y[pairwise], levels = 0:1)
  tbl <- table(x.fac, y.fac)
  S00 <- tbl[1, 1]
  S10 <- tbl[2, 1]
  S01 <- tbl[1, 2]
  S11 <- tbl[2, 2]
  if (S00 == 0) {
    S00 <- 0.5
  }
  if (S10 == 0) {
    S10 <- 0.5
  }
  if (S01 == 0) {
    S01 <- 0.5
  }
  if (S11 == 0) {
    S11 <- 0.5
  }
  # calcs tau
  tau_j <- qnorm(1 - mean(x, na.rm = TRUE))
  tau_k <- qnorm(1 - mean(y, na.rm = TRUE))
  ## BVN funcs
  BVN11 <- function(rho, tau_j, tau_k) {
    pmvnorm(upper = c(-tau_j, -tau_k), corr = matrix(c(1, rho, rho, 1), ncol = 2))
  }
  BVN01 <- function(rho, tau_j, tau_k) {
    pnorm(tau_j) - pmvnorm(upper = c(tau_j, tau_k), corr = matrix(c(1, rho, rho, 1), ncol = 2))
  }
  BVN10 <- function(rho, tau_j, tau_k) {
    pnorm(tau_k) - pmvnorm(upper = c(tau_j, tau_k), corr = matrix(c(1, rho, rho, 1), ncol = 2))
  }
  BVN00 <- function(rho, tau_j, tau_k) {
    pmvnorm(upper = c(tau_j, tau_k), corr = matrix(c(1, rho, rho, 1), ncol = 2))
  }
  ## LL
  log_likelihood_phi <- function(rho, tau_j, tau_k, S00, S11, S10, S01) {
    S00 * log(BVN00(rho, tau_j, tau_k)) + S01 * log(BVN01(rho, tau_j, tau_k)) +
      S10 * log(BVN10(rho, tau_j, tau_k)) + S11 * log(BVN11(rho, tau_j, tau_k))
  }
  ret <- optim(
    par = 0, # initial value
    fn = function(x) {
      -log_likelihood_phi(rho = x, tau_j, tau_k, S00, S11, S10, S01)
    },
    lower = -1, # lower limit
    upper = 1, # upper limit
    method = "Brent" # one-dimensional optimization method
  )
  ret <- structure(ret$par, class = c("exametrika"))
  return(ret)
}

#' @title Polychoric Correlation
#' @description
#' Calculate the polychoric correlation coefficient between two polytomous (categorical ordinal) variables.
#' Polychoric correlation estimates the correlation between two theorized normally distributed
#' continuous latent variables from two observed ordinal variables.
#'
#' @param x A polytomous vector (categorical ordinal variable)
#' @param y A polytomous vector (categorical ordinal variable)
#' @return The polychoric correlation coefficient between x and y
#' @details
#' This function handles missing values (coded as -1 or NA) using pairwise deletion.
#' The estimation uses maximum likelihood approach with Brent's method for optimization.
#' @examples
#' # Example with simulated data
#' set.seed(123)
#' x <- sample(1:5, 100, replace = TRUE)
#' y <- sample(1:4, 100, replace = TRUE)
#' polychoric(x, y)
#'
#' @export
polychoric <- function(x, y) {
  x[x == -1] <- NA
  y[y == -1] <- NA
  pairwise <- !is.na(x + y)
  x <- x[pairwise]
  y <- y[pairwise]
  mat <- table(x, y)
  fit <- optim(
    par = 0,
    fn = polychoric_likelihood,
    mat = mat,
    method = "Brent",
    lower = -1,
    upper = 1
  )
  if (fit$convergence != 0) {
    stop("Failed to converge when calculating polychoric correlation. Try with different initial values or check your data.")
  }
  cor <- fit$par
  return(cor)
}


#' @title Polyserial Correlation
#' @description Calculates the polyserial correlation coefficient between a continuous variable and an ordinal variable.
#' @details This function implements Olsson et al.'s ad hoc method for estimating the polyserial correlation
#' coefficient. The method assumes that the continuous variable follows a normal distribution and
#' that the ordinal variable is derived from an underlying continuous normal variable through
#' thresholds.
#' @param x A numeric vector representing the continuous variable.
#' @param y A numeric vector representing the ordinal variable (must be integer values).
#' @return A numeric value representing the estimated polyserial correlation coefficient.
#' @references U.Olsson, F.Drasgow, and N.Dorans (1982).
#' The polyserial correlation coefficient. Psychometrika, 47,337-347.
#' @examples
#' n <- 300
#' x <- rnorm(n)
#' y <- sample(1:5, size = n, replace = TRUE)
#' polyserial(x, y)
#' @importFrom stats dnorm
#' @export
#'
polyserial <- function(x, y) {
  x[x == -1] <- NA
  y[y == -1] <- NA
  pairwise <- !is.na(x + y)
  x <- x[pairwise]
  y <- y[pairwise]
  nobs <- length(y)
  ncat <- max(as.numeric(as.factor(y)))

  tbl <- tabulate(y)
  freq <- tbl / sum(tbl)
  cum_freq <- cumsum(freq)

  thresholds <- qnorm(cum_freq)[-ncat]
  tau <- dnorm(thresholds)
  sum_tau <- sum(tau)

  rxy <- cor(x, y)
  sd_y <- sqrt(var(y) * (nobs - 1) / nobs)
  rho <- rxy * sd_y / sum_tau
  rho <- pmin(pmax(rho, -1), 1)
  return(rho)
}

#' @title Polychoric Correlation Matrix
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @return A matrix of polychoric correlations with exametrika class.
#' Each element (i,j) represents the polychoric correlation between items i and j.
#' The matrix is symmetric with ones on the diagonal.
#' @examples
#' \donttest{
#' # example code
#' PolychoricCorrelationMatrix(J5S1000)
#' }
#' @export
#'
PolychoricCorrelationMatrix <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("PolychoricCorrelationMatrix")
}

#' @rdname PolychoricCorrelationMatrix
#' @export
PolychoricCorrelationMatrix.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    tmp <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "ordinal") {
    response_type_error(tmp$response.type, "PolychoricCorrelationMatrix")
  }
  PolychoricCorrelationMatrix.ordinal(U, na, Z, w)
}

#' @rdname PolychoricCorrelationMatrix
#' @export
PolychoricCorrelationMatrix.ordinal <- function(U, na = NULL, Z = NULL, w = NULL) {
  tmp <- dataFormat(data = U, na = na, Z = Z, w = w)
  nitems <- NCOL(tmp$Z)
  tmp$Q[tmp$Z == 0] <- NA
  ret.mat <- matrix(NA, ncol = nitems, nrow = nitems)
  for (i in 1:(nitems - 1)) {
    for (j in (i + 1):nitems) {
      x <- tmp$Q[, i]
      y <- tmp$Q[, j]
      ret.mat[i, j] <- ret.mat[j, i] <- polychoric(x, y)
    }
  }
  diag(ret.mat) <- 1
  return(ret.mat)
}

#' @title Tetrachoric Correlation Matrix
#' @description
#' Calculates the matrix of tetrachoric correlations between all pairs of items.
#' Tetrachoric Correlation is superior to the phi coefficient as a measure of the
#' relation of an item pair. This function is applicable only to binary response data.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A matrix of tetrachoric correlations with exametrika class.
#' Each element (i,j) represents the tetrachoric correlation between items i and j.
#' The matrix is symmetric with ones on the diagonal.
#' @examples
#' \donttest{
#' # example code
#' TetrachoricCorrelationMatrix(J15S500)
#' }
#' @export
TetrachoricCorrelationMatrix <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("TetrachoricCorrelationMatrix")
}

#' @rdname TetrachoricCorrelationMatrix
#' @export
TetrachoricCorrelationMatrix.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "TetrachoricCorrelationMatrix")
  }
  TetrachoricCorrelationMatrix.binary(U, na, Z, w)
}

#' @rdname TetrachoricCorrelationMatrix
#' @export
TetrachoricCorrelationMatrix.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  tmp <- dataFormat(data = U, na = na, Z = Z, w = w)
  tmp$U[tmp$Z == 0] <- NA
  Una <- tmp$U
  m <- ncol(Una)
  mat <- matrix(NA, ncol = m, nrow = m)
  colnames(mat) <- tmp$ItemLabel
  rownames(mat) <- tmp$ItemLabel
  for (i in 1:(m - 1)) {
    for (j in (i + 1):m) {
      x <- Una[, i]
      y <- Una[, j]
      mat[i, j] <- tetrachoric(x, y)
      mat[j, i] <- mat[i, j]
    }
  }
  diag(mat) <- 1
  structure(mat, class = c("exametrika", "matrix"))
}

#' @title Correct Response Rate
#' @description
#' The correct response rate (CRR) is one of the most basic and important
#' statistics for item analysis. This is an index of item difficulty and
#' a measure of how many students out of those who tried an item correctly
#' responded to it. This function is applicable only to binary response data.
#'
#' The CRR for each item is calculated as:
#' \deqn{p_j = \frac{\sum_{i=1}^n z_{ij}u_{ij}}{\sum_{i=1}^n z_{ij}}}
#' where \eqn{z_{ij}} is the missing indicator and \eqn{u_{ij}} is the response.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector of weighted correct response rates for each item.
#' Values range from 0 to 1, where higher values indicate easier items
#' (more students answered correctly).
#' @examples
#' # Simple binary data
#' U <- matrix(c(1, 0, 1, 1, 0, 1), ncol = 2)
#' crr(U)
#'
#' # using sample datasaet
#' crr(J15S500)
#' @export
crr <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("crr")
}

#' @rdname crr
#' @export
crr.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "crr")
  }
  crr.binary(U, na, Z, w)
}

#' @rdname crr
#' @export
crr.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Create unit vector for summation
  OneS <- rep(1, length = nrow(U$U))

  # Calculate correct response rate
  # (sum of correct responses) / (sum of non-missing responses)
  p <- t(U$Z * U$U) %*% OneS / t(U$Z) %*% OneS

  # Apply item weights
  pW <- U$w * p

  return(pW)
}

#' @title Item Odds
#' @description
#' Item Odds are defined as the ratio of Correct Response Rate to
#' Incorrect Response Rate:
#' \deqn{O_j = \frac{p_j}{1-p_j}}
#' where \eqn{p_j} is the correct response rate for item j.
#' This function is applicable only to binary response data.
#'
#' The odds value represents how many times more likely a correct response is
#' compared to an incorrect response. For example, an odds of 2 means students
#' are twice as likely to answer correctly as incorrectly.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector of odds values for each item. Values range from 0 to infinity,
#' where:
#' \itemize{
#'   \item odds > 1: correct response more likely than incorrect
#'   \item odds = 1: equally likely
#'   \item odds < 1: incorrect response more likely than correct
#' }
#' @examples
#' # using sample dataset
#' ItemOdds(J5S10)
#' @export
ItemOdds <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("ItemOdds")
}

#' @rdname ItemOdds
#' @export
ItemOdds.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "ItemOdds")
  }
  ItemOdds.binary(U, na, Z, w)
}

#' @rdname ItemOdds
#' @export
ItemOdds.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Calculate correct response rates
  p <- crr(U)

  # Calculate odds
  o <- p / (1 - p)

  return(o)
}


#' @title Item Threshold
#' @description
#' Item threshold is a measure of difficulty based on a standard normal distribution.
#' This function is applicable only to binary response data.
#'
#' The threshold is calculated as:
#' \deqn{\tau_j = \Phi^{-1}(1-p_j)}
#' where \eqn{\Phi^{-1}} is the inverse standard normal distribution function
#' and \eqn{p_j} is the correct response rate for item j.
#'
#' Higher threshold values indicate more difficult items, as they represent the
#' point on the standard normal scale above which examinees tend to answer incorrectly.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector of threshold values for each item on the standard normal scale.
#' Typical values range from about -3 to 3, where:
#' \itemize{
#'   \item Positive values indicate difficult items
#'   \item Zero indicates items of medium difficulty (50% correct)
#'   \item Negative values indicate easy items
#' }
#' @importFrom stats qnorm
#' @examples
#' # using sample dataset
#' ItemThreshold(J5S10)
#' @export
ItemThreshold <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("ItemThreshold")
}
#' @export
ItemThreshold.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (inherits(U, "exametrika")) {
    switch(U$response.type,
      "binary" = ItemThreshold.binary(U, na, Z, w),
      "rated" = ItemThreshold.ordinal(U, na, Z, w),
      "ordinal" = ItemThreshold.ordinal(U, na, Z, w),
      "nominal" = response_type_error(U$response.type, "ItemThreshold")
    )
  } else {
    U <- dataFormat(U, na = na, Z = Z, w = w)
    ItemThreshold(U)
  }
}

#' @rdname ItemThreshold
#' @export
ItemThreshold.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Calculate correct response rates
  p <- crr(U)
  # Calculate thresholds using inverse normal distribution
  tau <- qnorm(1 - p)
  return(tau)
}

#' @rdname ItemThreshold
#' @export
ItemThreshold.ordinal <- function(U, na = NULL, Z = NULL, w = NULL) {
  nitems <- ncol(U$Q)
  ncat <- U$categories
  U$Q[U$Z == 0] <- NA
  threshold <- vector("list", nitems)
  for (j in 1:nitems) {
    tmp <- qnorm(cumsum(tabulate(U$Q[, j]) / sum(tabulate(U$Q[, j]))))
    threshold[[j]] <- tmp[1:ncat[j] - 1]
  }
  return(threshold)
}


#' @title Item Entropy
#' @description
#' The item entropy is an indicator of the variability or randomness
#' of the responses. This function is applicable only to binary response data.
#'
#' The entropy value represents the uncertainty or information content of the
#' response pattern for each item, measured in bits. Maximum entropy (1 bit)
#' occurs when correct and incorrect responses are equally likely (p = 0.5).
#' @details
#' The item entropy is calculated as:
#' \deqn{e_j = -p_j\log_2p_j-(1-p_j)\log_2(1-p_j)}
#' where \eqn{p_j} is the correct response rate for item j.
#'
#' The entropy value has the following properties:
#' \itemize{
#'   \item Maximum value of 1 bit when p = 0.5 (most uncertainty)
#'   \item Minimum value of 0 bits when p = 0 or 1 (no uncertainty)
#'   \item Higher values indicate more balanced response patterns
#'   \item Lower values indicate more predictable response patterns
#' }
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector of entropy values for each item, measured in bits.
#' Values range from 0 to 1, where:
#' \itemize{
#'   \item 1: maximum uncertainty (p = 0.5)
#'   \item 0: complete certainty (p = 0 or 1)
#'   \item Values near 1 indicate items with balanced response patterns
#'   \item Values near 0 indicate items with extreme response patterns
#' }
#' @examples
#' # using sample dataset
#' ItemEntropy(J5S10)
#' @export
ItemEntropy <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("ItemEntropy")
}

#' @rdname ItemEntropy
#' @export
ItemEntropy.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (inherits(U, "exametrika")) {
    switch(U$response.type,
      "binary" = ItemEntropy.binary(U, na, Z, w),
      "rated" = ItemEntropy.ordinal(U, na, Z, w),
      "ordinal" = ItemEntropy.ordinal(U, na, Z, w),
      "nominal" = response_type_error(U$response.type, "ItemEntropy")
    )
  } else {
    U <- dataFormat(U, na = na, Z = Z, w = w)
    ItemEntropy(U)
  }
}

#' @rdname ItemEntropy
#' @export
ItemEntropy.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Calculate correct response rates
  p <- crr(U)
  # Calculate entropy in bits
  # Using log base 2 for information content in bits
  itemE <- -p * log(p, base = 2) - (1 - p) * log(1 - p, base = 2)
  return(itemE)
}

#' @rdname ItemEntropy
#' @export
ItemEntropy.ordinal <- function(U, na = NULL, Z = NULL, w = NULL) {
  nitems <- ncol(U$Q)
  ncat <- U$categories
  U$Q[U$Z == 0] <- NA
  entropy <- rep(0, nitems)
  for (j in 1:nitems) {
    vec <- tabulate(U$Q[, j]) / sum(tabulate(U$Q[, j]))
    entropy[j] <- sum(vec * log(vec, base = ncat[j])) * -1
  }
  return(entropy)
}



#' @title Item-Total Correlation
#' @description
#' Item-Total correlation (ITC) is a Pearson's correlation of an item with
#' the Number-Right Score (NRS) or total score. This function is applicable
#' only to binary response data.
#'
#' The ITC is a measure of item discrimination, indicating how well an item
#' distinguishes between high and low performing examinees.
#' @details
#' The correlation is calculated between:
#' \itemize{
#'   \item Each item's responses (0 or 1)
#'   \item The total test score (sum of correct responses)
#' }
#' Higher positive correlations indicate items that better discriminate between
#' high and low ability examinees.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector of item-total correlations. Values typically range
#' from -1 to 1, where:
#' \itemize{
#'   \item Values near 1: Strong positive discrimination
#'   \item Values near 0: No discrimination
#'   \item Negative values: Potential item problems (lower ability students
#'         performing better than higher ability students)
#' }
#' @note
#' Values below 0.2 might indicate problematic items that should be reviewed.
#' Values above 0.3 are generally considered acceptable.
#' @examples
#' # using sample dataset
#' ItemTotalCorr(J15S500)
#'
#' @export
ItemTotalCorr <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("ItemTotalCorr")
}

#' @rdname ItemTotalCorr
#' @export
ItemTotalCorr.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (inherits(U, "exametrika")) {
    switch(U$response.type,
      "binary" = ItemTotalCorr.binary(U, na, Z, w),
      "rated" = ItemTotalCorr.binary(U, na, Z, w),
      "ordinal" = ItemTotalCorr.ordinal(U, na, Z, w),
      "nominal" = response_type_error(U$response.type, "ItemTotalCorr")
    )
  } else {
    U <- dataFormat(U, na = na, Z = Z, w = w)
    ItemTotalCorr(U)
  }
}

#' @rdname ItemTotalCorr
#' @export
ItemTotalCorr.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Calculate item probabilities
  p <- crr(U)
  # Calculate total scores
  Zeta <- sscore(U)
  # Create probability matrix (repeating p for each student)
  TBL <- matrix(rep(p, each = NROW(U$U)),
    nrow = NROW(U$U),
    byrow = FALSE
  )
  # Handle missing values in response matrix
  Una <- ifelse(is.na(U$U), 0, U$U)
  # Calculate deviations from expected values
  dev <- U$Z * (Una - TBL)
  # Calculate item variances
  V <- colSums(dev^2) / (colSums(U$Z) - 1)
  SD <- sqrt(V)
  # Calculate correlations
  rho_Zi <- t(dev) %*% Zeta / SD / colSums(U$Z)
  return(rho_Zi)
}

#' @rdname ItemTotalCorr
#' @export
ItemTotalCorr.ordinal <- function(U, na = NULL, Z = NULL, w = NULL) {
  total <- rowSums(U$Q)
  nitems <- NCOL(U$Z)
  rho_Zi <- numeric(nitems)
  for (j in 1:nitems) {
    rho_Zi[j] <- polyserial(total, U$Q[, j])
  }
  return(rho_Zi)
}


#' @title Biserial Correlation
#' @description
#' A biserial correlation is a correlation between dichotomous-ordinal and
#' continuous variables.
#' @param i i is a dichotomous-ordinal variable (0/1). x and y can also be the other way around.
#' @param t t is a continuous variable. x and y can also be the other way around.
#' @return The biserial correlation coefficient between the two variables.
#' @importFrom stats na.omit qnorm pnorm optim
#' @export
BiserialCorrelation <- function(i, t) {
  # Original function remains unchanged
  # Count unique values
  unique_i <- length(unique(na.omit(i)))
  unique_t <- length(unique(na.omit(t)))
  # Check if one is binary and the other is continuous
  if (!((unique_i == 2 && unique_t > 2) | (unique_t == 2 && unique_i > 2))) {
    stop("One argument must be binary and the other must be continuous.")
  }
  ## if switched...
  if (unique_i > 2) {
    tmp <- i
    i <- t
    t <- tmp
  }
  ## calcs correlation
  tau_j <- qnorm(1 - mean(i, na.rm = TRUE))
  ll <- function(rho, tau_j, i, t) {
    tmp <- (1 - i) %*% (log(pnorm(tau_j, mean = rho * t, sd = sqrt(1 - rho^2)))) +
      i %*% (log(1 - pnorm(tau_j, mean = rho * t, sd = sqrt(1 - rho^2))))
  }
  pairwise <- !is.na(i + t)
  ret <- optim(
    par = 0, # initial value
    fn = function(x) {
      -ll(rho = x, tau_j, i[pairwise], t[pairwise])
    },
    lower = -1, # lower limit
    upper = 1, # upper limit
    method = "Brent" # one-dimensional optimization method
  )
  return(ret$par)
}

#' @title Item-Total Biserial Correlation
#' @description
#' The Item-Total Biserial Correlation computes the biserial correlation
#' between each item and the total score. This function is applicable only
#' to binary response data.
#'
#' This correlation provides a measure of item discrimination, indicating how well
#' each item distinguishes between high and low performing examinees.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector of item-total biserial correlations. Values range
#' from -1 to 1, where:
#' \itemize{
#'   \item Values near 1: Strong positive discrimination
#'   \item Values near 0: No discrimination
#'   \item Negative values: Potential item problems
#' }
#' @note
#' The biserial correlation is generally preferred over the point-biserial
#' correlation when the dichotomization is artificial (i.e., when the underlying
#' trait is continuous).
#' @examples
#' # using sample dataset
#' ITBiserial(J15S500)
#' @export
ITBiserial <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("ITBiserial")
}

#' @rdname ITBiserial
#' @export
ITBiserial.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    tmp <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "ITBiserial")
  }
  ITBiserial.binary(U, na, Z, w)
}

#' @rdname ITBiserial
#' @export
ITBiserial.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Calculate total scores
  Zeta <- sscore(U)

  # Handle missing values
  U_data <- U$U
  U_data[U$Z == 0] <- NA

  # Calculate biserial correlation for each item
  ITB <- vapply(seq_len(ncol(U_data)), function(i) {
    BiserialCorrelation(U_data[, i], Zeta)
  }, numeric(1))

  return(ITB)
}


#' @title Number Right Score
#' @description
#' The Number-Right Score (NRS) function calculates the weighted sum of correct
#' responses for each examinee. This function is applicable only to binary
#' response data.
#'
#' For each examinee, the score is computed as:
#' \deqn{NRS_i = \sum_{j=1}^J z_{ij}u_{ij}w_j}
#' where:
#' \itemize{
#'   \item \eqn{z_{ij}} is the missing response indicator (0/1)
#'   \item \eqn{u_{ij}} is the response (0/1)
#'   \item \eqn{w_j} is the item weight
#' }
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector containing the Number-Right Score for each examinee.
#' The score represents the weighted sum of correct answers, where:
#' \itemize{
#'   \item Maximum score is the sum of all item weights
#'   \item Minimum score is 0
#'   \item Missing responses do not contribute to the score
#' }
#' @examples
#' # using sample dataset
#' nrs(J15S500)
#' @export
nrs <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("nrs")
}

#' @rdname nrs
#' @export
nrs.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "nrs")
  }
  nrs.binary(U, na, Z, w)
}

#' @rdname nrs
#' @export
nrs.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Calculate weighted sum of correct responses
  # Z * U gives correct answers accounting for missing data
  # Multiply by weights and sum across items
  total_weighted_score <- (U$Z * U$U) %*% U$w
  return(total_weighted_score)
}

#' @title Passage Rate of Student
#' @description
#' The Passage Rate for each student is calculated as their Number-Right Score (NRS)
#' divided by the number of items presented to them. This function is applicable
#' only to binary response data.
#'
#' The passage rate is calculated as:
#' \deqn{P_i = \frac{\sum_{j=1}^J z_{ij}u_{ij}w_j}{\sum_{j=1}^J z_{ij}}}
#' where:
#' \itemize{
#'   \item \eqn{z_{ij}} is the missing response indicator (0/1)
#'   \item \eqn{u_{ij}} is the response (0/1)
#'   \item \eqn{w_j} is the item weight
#' }
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector containing the passage rate for each student.
#' Values range from 0 to 1 (or maximum weight) where:
#' \itemize{
#'   \item 1: Perfect score on all attempted items
#'   \item 0: No correct answers
#'   \item NA: No items attempted
#' }
#' @note
#' The passage rate accounts for missing responses by only considering items that
#' were actually presented to each student. This provides a fair comparison
#' between students who attempted different numbers of items.
#' @examples
#' # using sample dataset
#' passage(J15S500)
#'
#' @export
passage <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("passage")
}

#' @rdname passage
#' @export
passage.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "passage")
  }
  passage.binary(U, na, Z, w)
}

#' @rdname passage
#' @export
passage.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Calculate Number-Right Score
  total_score <- nrs(U)

  # Calculate number of items presented to each student
  items_attempted <- NCOL(U$U) - rowSums(1 - U$Z)

  # Calculate passage rate
  # Divide total score by number of items attempted
  passage_rate <- total_score / items_attempted

  return(passage_rate)
}

#' @title Standardized Score
#' @description
#' The standardized score (z-score) indicates how far a student's performance
#' deviates from the mean in units of standard deviation. This function is
#' applicable only to binary response data.
#'
#' The score is calculated by standardizing the passage rates:
#' \deqn{Z_i = \frac{r_i - \bar{r}}{\sigma_r}}
#' where:
#' \itemize{
#'   \item \eqn{r_i} is student i's passage rate
#'   \item \eqn{\bar{r}} is the mean passage rate
#'   \item \eqn{\sigma_r} is the standard deviation of passage rates
#' }
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector of standardized scores for each student. The scores follow
#' a standard normal distribution with:
#' \itemize{
#'   \item Mean = 0
#'   \item Standard deviation = 1
#'   \item Approximately 68% of scores between -1 and 1
#'   \item Approximately 95% of scores between -2 and 2
#'   \item Approximately 99% of scores between -3 and 3
#' }
#' @note
#' The standardization allows for comparing student performance across different
#' tests or groups. A positive score indicates above-average performance, while
#' a negative score indicates below-average performance.
#' @examples
#' # using sample dataset
#' sscore(J5S10)
#' @export
sscore <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("sscore")
}

#' @rdname sscore
#' @export
sscore.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "sscore")
  }
  sscore.binary(U, na, Z, w)
}

#' @rdname sscore
#' @export
sscore.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Get number of students
  S <- nrow(U$U)

  # Create unit vector
  OneS <- rep(1, length = S)

  # Calculate passage rates
  passage_rates <- passage(U)

  # Calculate mean passage rate
  mean_rate <- mean(passage_rates, na.rm = TRUE)

  # Calculate variance of passage rates
  centered_rates <- passage_rates - mean_rate
  var_rates <- sum(centered_rates^2, na.rm = TRUE) / (S - 1)

  # Calculate standardized scores
  z_scores <- centered_rates / sqrt(var_rates)

  return(z_scores)
}


#' @title Student Percentile Ranks
#' @description
#' The percentile function calculates each student's relative standing in the group,
#' expressed as a percentile rank (1-100). This function is applicable only to
#' binary response data.
#'
#' The percentile rank indicates the percentage of scores in the distribution
#' that fall below a given score. For example, a percentile rank of 75 means
#' the student performed better than 75% of the group.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A numeric vector of percentile ranks (1-100) for each student, where:
#' \itemize{
#'   \item 100: Highest performing student(s)
#'   \item 50: Median performance
#'   \item 1: Lowest performing student(s)
#' }
#' @note
#' Percentile ranks are calculated using the empirical cumulative distribution
#' function of standardized scores. Tied scores receive the same percentile rank.
#' The values are rounded up to the nearest integer to provide ranks from 1 to 100.
#' @examples
#' # using sample dataset
#' percentile(J5S10)
#'
#' @importFrom stats ecdf
#' @export
percentile <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("percentile")
}

#' @rdname percentile
#' @export
percentile.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "percentile")
  }
  percentile.binary(U, na, Z, w)
}

#' @rdname percentile
#' @export
percentile.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Calculate standardized scores
  standardized_scores <- sscore(U)

  # Calculate empirical cumulative distribution function
  empirical_dist <- ecdf(standardized_scores)

  # Convert to percentiles (1-100)
  # Ceiling function ensures minimum of 1 and handles rounding
  percentile_ranks <- ceiling(empirical_dist(standardized_scores) * 100)

  return(percentile_ranks)
}

#' @title Stanine Scores
#' @description
#' The Stanine (Standard Nine) scoring system divides students into nine groups
#' based on a normalized distribution. This function is applicable only to
#' binary response data.
#'
#' These groups correspond to the following percentile ranges:
#' \itemize{
#'   \item Stanine 1: lowest 4% (percentiles 1-4)
#'   \item Stanine 2: next 7% (percentiles 5-11)
#'   \item Stanine 3: next 12% (percentiles 12-23)
#'   \item Stanine 4: next 17% (percentiles 24-40)
#'   \item Stanine 5: middle 20% (percentiles 41-60)
#'   \item Stanine 6: next 17% (percentiles 61-77)
#'   \item Stanine 7: next 12% (percentiles 78-89)
#'   \item Stanine 8: next 7% (percentiles 90-96)
#'   \item Stanine 9: highest 4% (percentiles 97-100)
#' }
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @note This function is implemented using a binary data compatibility wrapper and
#'       will raise an error if used with polytomous data.
#' @return A list containing two elements:
#' \describe{
#'   \item{stanine}{The score boundaries for each stanine level}
#'   \item{stanineScore}{The stanine score (1-9) for each student}
#' }
#' @note
#' Stanine scores provide a normalized scale with:
#' \itemize{
#'   \item Mean = 5
#'   \item Standard deviation = 2
#'   \item Scores range from 1 to 9
#'   \item Score of 5 represents average performance
#' }
#' @references
#' Angoff, W. H. (1984). Scales, norms, and equivalent scores. Educational Testing Service.
#' (Reprint of chapter in R. L. Thorndike (Ed.) (1971) Educational Measurement (2nd Ed.).
#' American Council on Education.
#' @importFrom stats quantile
#' @examples
#' result <- stanine(J15S500)
#' # View score boundaries
#' result$stanine
#' # View individual scores
#' result$stanineScore
#'
#' @export
stanine <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("stanine")
}

#' @rdname stanine
#' @export
stanine.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (!inherits(U, "exametrika")) {
    U <- dataFormat(U, na = na, Z = Z, w = w)
  }
  if (U$response.type != "binary") {
    response_type_error(U$response.type, "stanine")
  }
  stanine.binary(U, na, Z, w)
}

#' @rdname stanine
#' @export
stanine.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  # Define stanine boundaries (cumulative proportions)
  stanine_bounds <- cumsum(c(0.04, 0.07, 0.12, 0.17, 0.20, 0.17, 0.12, 0.07))

  # Calculate raw scores
  raw_scores <- nrs(U)

  # Calculate score boundaries using raw scores
  stanine_boundaries <- quantile(raw_scores,
    probs = stanine_bounds,
    na.rm = TRUE
  )

  # Calculate percentile scores
  percentile_scores <- percentile(U)

  # Calculate stanine boundaries using percentile scores
  stanine_percentile_bounds <- quantile(percentile_scores,
    probs = stanine_bounds
  )
  stanine_percentile_bounds <- unique(stanine_percentile_bounds)
  if (length(stanine_percentile_bounds) != 8) {
    warning(
      "Standard stanine (9-level) division not possible due to data distribution.\n",
      "Merging duplicate boundaries for adjustment."
    )
  }
  # Assign stanine scores
  stanine_scores <- cut(percentile_scores,
    breaks = c(-Inf, stanine_percentile_bounds, Inf),
    right = FALSE,
    labels = 1:(length(stanine_percentile_bounds) + 1)
  )

  # Return results
  list(
    stanine = stanine_boundaries,
    stanineScore = stanine_scores
  )
}

#' @title Dimensionality
#' @description
#' The dimensionality is the number of components
#' the test is measuring.
#' @param U Either an object of class "exametrika" or raw data. When raw data is given,
#'   it is converted to the exametrika class with the \code{\link{dataFormat}} function.
#' @param Z Missing indicator matrix of type matrix or data.frame. Values of 1 indicate
#'   observed responses, while 0 indicates missing data.
#' @param w Item weight vector specifying the relative importance of each item.
#' @param na Values to be treated as missing values.
#' @return Returns a list of class c("exametrika", "Dimensionality") containing:
#'   \describe{
#'     \item{Component}{Sequence of component numbers}
#'     \item{Eigenvalue}{Eigenvalues of the tetrachoric correlation matrix}
#'     \item{PerOfVar}{Percentage of variance explained by each component}
#'     \item{CumOfPer}{Cumulative percentage of variance explained}
#'   }
#'
#' @export
Dimensionality <- function(U, na = NULL, Z = NULL, w = NULL) {
  UseMethod("Dimensionality")
}

#' @rdname Dimensionality
#' @export
Dimensionality.default <- function(U, na = NULL, Z = NULL, w = NULL) {
  if (inherits(U, "exametrika")) {
    switch(U$response.type,
      "binary" = Dimensionality.binary(U, na = NULL, Z = NULL, w = NULL),
      "rated" = Dimensionality.rated(U, na = NULL, Z = NULL, w = NULL),
      "ordinal" = Dimensionality.ordinal(U, na = NULL, Z = NULL, w = NULL),
      "nominal" = response_type_error(U$response.type, "Dimensionality")
    )
  } else {
    U <- dataFormat(U, na = na, Z = Z, w = w)
    Dimensionality(U)
  }
}

#' @rdname Dimensionality
#' @export
Dimensionality.binary <- function(U, na = NULL, Z = NULL, w = NULL) {
  tmp <- dataFormat(data = U, na = na, Z = Z, w = w)
  R <- TetrachoricCorrelationMatrix(tmp)
  Esystem <- eigen(R)
  Eval <- Esystem$values
  EvalVariance <- Esystem$values / length(Eval) * 100
  CumVari <- cumsum(EvalVariance)
  ret <-
    structure(
      list(
        Component = seq(1:length(Eval)),
        Eigenvalue = Eval,
        PerOfVar = EvalVariance,
        CumOfPer = CumVari
      ),
      class = c("exametrika", "Dimensionality")
    )

  return(ret)
}

#' @rdname Dimensionality
#' @export
Dimensionality.rated <- function(U, na = NULL, Z = NULL, w = NULL) {
  tmp <- dataFormat(data = U, na = na, Z = Z, w = w)
  R <- TetrachoricCorrelationMatrix(tmp$U)
  Esystem <- eigen(R)
  Eval <- Esystem$values
  EvalVariance <- Esystem$values / length(Eval) * 100
  CumVari <- cumsum(EvalVariance)
  ret <-
    structure(
      list(
        Component = seq(1:length(Eval)),
        Eigenvalue = Eval,
        PerOfVar = EvalVariance,
        CumOfPer = CumVari
      ),
      class = c("exametrika", "Dimensionality")
    )

  return(ret)
}

#' @rdname Dimensionality
#' @export
Dimensionality.ordinal <- function(U, na = NULL, Z = NULL, w = NULL) {
  tmp <- dataFormat(data = U, na = na, Z = Z, w = w)
  R <- PolychoricCorrelationMatrix(tmp)
  Esystem <- eigen(R)
  Eval <- Esystem$values
  EvalVariance <- Esystem$values / length(Eval) * 100
  CumVari <- cumsum(EvalVariance)
  ret <-
    structure(
      list(
        Component = seq(1:length(Eval)),
        Eigenvalue = Eval,
        PerOfVar = EvalVariance,
        CumOfPer = CumVari
      ),
      class = c("exametrika", "Dimensionality")
    )

  return(ret)
}

Try the exametrika package in your browser

Any scripts or data that you put into this service are public.

exametrika documentation built on Aug. 21, 2025, 5:27 p.m.