Nothing
#' @title Infinite Relational Model
#' @description
#' The purpose of this method is to find
#' the optimal number of classes C, and optimal number of
#' fields F. It can be found in a single run of the analysis, but
#' it takes a long computation time when the sample size S is large.
#' In addition, this method incorporates the Chinese restaurant process
#' and Gibbs sampling. In detail, See Section 7.8 in Shojima(2022).
#' @param U U is either a data class of exametrika, or raw data. When raw data is given,
#' it is converted to the exametrika class with the [dataFormat] function.
#' @param Z Z is a missing indicator matrix of the type matrix or data.frame
#' @param w w is item weight vector
#' @param na na argument specifies the numbers or characters to be treated as missing values.
#' @param gamma_c \eqn{\gamma_C} is the hyperparameter of the CRP and represents the
#' attractiveness of a new Class. As \eqn{\gamma_C} increases, the student is more likely
#' to be seated at a vacant class. The default is 1.
#' @param gamma_f \eqn{\gamma_F} is the hyperparameter of the CRP and represents the
#' attractiveness of a new Field. The greater this value it more likely to be classified
#' in the new field. The default is 1.
#' @param max_iter A maximum iteration number of IRM process. The default is 100.
#' @param stable_limit The IRM process exits the loop when the FRM stabilizes and no longer
#' changes significantly. This option sets the maximum number of stable iterations,
#' with a default of 5.
#' @param minSize A value used for readjusting the number of classes.If the size of each
#' class is less than \code{minSize}, the number of classes will be reduced. Note that this
#' under limit of size is not used for either all correct or all incorrect class.
#' @param EM_limit After IRM process, resizing the number of classes process will starts.
#' This process using EM algorithm,\code{EM_limit} is the maximum number of iteration with
#' default of 20.
#' @param seed seed value for random numbers.
#' @param verbose verbose output Flag. default is TRUE
#' @return
#' \describe{
#' \item{nobs}{Sample size. The number of rows in the dataset.}
#' \item{msg}{A character string indicating the model type. }
#' \item{testlength}{Length of the test. The number of items included in the test.}
#' \item{Nclass}{Optimal number of classes.}
#' \item{Nfield}{Optimal number of fields.}
#' \item{BRM}{Bicluster Reference Matrix}
#' \item{FRP}{Field Reference Profile}
#' \item{FRPIndex}{Index of FFP includes the item location parameters B and Beta,
#' the slope parameters A and Alpha, and the monotonicity indices C and Gamma.}
#' \item{TRP}{Test Reference Profile}
#' \item{FMP}{Field Membership Profile}
#' \item{Students}{Rank Membership Profile matrix.The s-th row vector of \eqn{\hat{M}_R}, \eqn{\hat{m}_R}, is the
#' rank membership profile of Student s, namely the posterior probability distribution representing the student's
#' belonging to the respective latent classes. It also includes the rank with the maximum estimated membership probability,
#' as well as the rank-up odds and rank-down odds.}
#' \item{LRD}{Latent Rank Distribution. see also [plot.exametrika]}
#' \item{LFD}{Latent Field Distribution. see also [plot.exametrika]}
#' \item{RMD}{Rank Membership Distribution.}
#' \item{TestFitIndices}{Overall fit index for the test.See also [TestFit]}
#' }
#' @importFrom stats rmultinom
#' @examples
#' \donttest{
#' # Fit an Infinite Relational Model (IRM) to determine optimal number of classes and fields
#' # gamma_c and gamma_f are concentration parameters for the Chinese Restaurant Process
#' result.IRM <- IRM(J35S515, gamma_c = 1, gamma_f = 1, verbose = TRUE)
#'
#' # Display the Bicluster Reference Matrix (BRM) as a heatmap
#' # Shows the discovered clustering structure of items and students
#' plot(result.IRM, type = "Array")
#'
#' # Plot Field Reference Profiles (FRP) in a 3-column grid
#' # Shows the probability patterns for each automatically determined field
#' plot(result.IRM, type = "FRP", nc = 3)
#'
#' # Plot Test Reference Profile (TRP)
#' # Shows the overall response pattern across all fields
#' plot(result.IRM, type = "TRP")
#' }
#'
#' @export
IRM <- function(U, Z = NULL, w = NULL, na = NULL,
gamma_c = 1, gamma_f = 1,
max_iter = 100, stable_limit = 5, minSize = 20, EM_limit = 20,
seed = 123, verbose = TRUE) {
# data format
if (!inherits(U, "exametrika")) {
tmp <- dataFormat(data = U, na = na, Z = Z, w = w)
} else {
tmp <- U
}
U <- tmp$U * tmp$Z
testlength <- NCOL(tmp$U)
nobs <- NROW(tmp$U)
const <- 1e-100
gamp <- 1
# Initialize
set.seed(seed)
limit_count <- 0
iter <- 1
pattern <- sort(unique(nrs(tmp)))
zeroG <- ifelse(pattern[1] == 0, 1, 0)
fullG <- ifelse(pattern[length(pattern)] == testlength, 1, 0)
## Initial Class
ncls <- length(pattern)
cls01 <- matrix(0, ncol = length(pattern), nrow = nobs)
for (i in 1:nobs) {
cls01[i, which((pattern == nrs(tmp)[i]))] <- 1
}
colnames(cls01) <- paste("Class", 1:ncls)
rownames(cls01) <- tmp$ID
cls <- cls01 %*% (1:ncls)
Nc <- colSums(cls01)
## Initial Field
nfld <- testlength
field <- 1:testlength
crr <- crr(tmp)
df_tmp <- data.frame(field, crr)
df_tmp <- df_tmp[order(crr, decreasing = T), ]
sorted_list <- df_tmp$field
field <- order(sorted_list)
fld01 <- matrix(0, nrow = testlength, ncol = nfld)
for (i in 1:testlength) {
fld01[i, field[i]] <- 1
}
colnames(fld01) <- paste("Field", 1:nfld)
rownames(fld01) <- tmp$ItemLabel
Nf <- colSums(fld01)
## Inner Function for Ccf/Fcf/Ncf/Pcf
Ccf_Fcf_Pcf <- function(U, Z, cls01, fld01, gamp, zeroG, fullG, type = 1) {
Ccf <- t(cls01) %*% tmp$U %*% fld01
Fcf <- t(cls01) %*% (tmp$Z * (1 - tmp$U)) %*% fld01
Ncf <- Ccf + Fcf
Pcf <- (Ccf + gamp - 1) / (Ncf + 2 * gamp - (type * 2))
if (zeroG == 1) {
Pcf[1, ] <- 0
}
if (fullG == 1) {
Pcf[nrow(Pcf), ] <- 1
}
return(list(Ccf = Ccf, Fcf = Fcf, Ncf = Ncf, Pcf = Pcf))
}
## Inner Function for Sorting PCF
Pcf_sort <- function(Pcf, cls01, fld01, ncls, nfld) {
sort_list <- order(colSums(Pcf), decreasing = TRUE)
Pcf <- Pcf[, sort_list]
fld01 <- fld01[, sort_list]
colnames(fld01) <- paste("Field", 1:nfld)
field <- t((1:nfld) %*% t(fld01))
sort_list2 <- order(rowSums(Pcf), decreasing = FALSE)
Pcf <- Pcf[sort_list2, ]
cls01 <- cls01[, sort_list2]
colnames(cls01) <- paste("Class", 1:ncls)
cls <- cls01 %*% (1:ncls)
return(list(Pcf = Pcf, cls01 = cls01, fld01 = fld01, cls = cls, field = field))
}
### Initial value for parameter set and the model fit
ret <- Ccf_Fcf_Pcf(tmp$U, tmp$Z, cls01, fld01, gamp, zeroG, fullG, type = 1)
Ccf <- ret$Ccf
Fcf <- ret$Fcf
Ncf <- ret$Ncf
Pcf <- ret$Pcf
# IRM Iteration ------------------------------------------------------
IRM_FLG <- TRUE
while (IRM_FLG) {
### Gibbs sampler i
Cif <- tmp$U %*% fld01 # SxF. number of correct response Studet S in Field f
Fif <- (tmp$Z * (1 - tmp$U)) %*% fld01 # SxF. number of incorrect response Studet S in Field f
iRand <- sample(1:nobs, nobs, replace = F)
for (loop in 1:nobs) {
target <- iRand[loop]
if (cls[target] >= 2 && cls[target] <= ncls - 1) {
# delete selected member
Nc[cls[target]] <- Nc[cls[target]] - 1
Ncf[cls[target], ] <- Ncf[cls[target], ] - Nf
Ccf[cls[target], ] <- Ccf[cls[target], ] - Cif[target, ]
Fcf[cls[target], ] <- Fcf[cls[target], ] - Fif[target, ]
CcfPlus <- Ccf + matrix(rep(Cif[target, ], ncls), nrow = ncls, byrow = T)
FcfPlus <- Fcf + matrix(rep(Fif[target, ], ncls), nrow = ncls, byrow = T)
if (Nc[cls[target]] == 0) {
## if the class disappeared...
ncls <- ncls - 1
delCls <- cls[target]
cls01 <- cls01[, -delCls]
cls <- cls01 %*% (1:ncls)
Nc <- Nc[-delCls]
Ncf <- Ncf[-delCls, ]
Ccf <- Ccf[-delCls, ]
Fcf <- Fcf[-delCls, ]
CcfPlus <- CcfPlus[-delCls, ]
FcfPlus <- FcfPlus[-delCls, ]
}
# Table Choice!
## Likelihood for Existing Table
Nc_tmp <- Nc[-1]
Nc_tmp <- Nc_tmp[-(length(Nc_tmp))]
exist_tab_tmp <- log((Nc_tmp / (nobs - 1 + gamma_c)) + const)
vec1 <- numeric(ncls - 2)
vec2 <- numeric(ncls - 2)
vec3 <- numeric(ncls - 2)
for (i in 2:(ncls - 1)) {
Ctmp <- 0
Ftmp <- 0
den <- 0
for (j in 1:nfld) {
ct1 <- (CcfPlus[i, j] - Ccf[i, j] - 1)
if (ct1 >= 0) {
for (s in 0:ct1) {
Ctmp <- Ctmp + log(CcfPlus[i, j] + gamp - 1 - s + const)
}
}
ct2 <- (FcfPlus[i, j] - Fcf[i, j] - 1)
if (ct2 >= 0) {
for (s in 0:ct2) {
Ftmp <- Ftmp + log(FcfPlus[i, j] + gamp - 1 - s + const)
}
}
ct3 <- CcfPlus[i, j] + FcfPlus[i, j] - Ccf[i, j] - Fcf[i, j] - 1
if (ct3 >= 0) {
for (s in 0:ct3) {
den <- den +
(log(CcfPlus[i, j] + FcfPlus[i, j] + 2 * gamp - 1 - s + const))
}
}
}
vec1[i - 1] <- Ctmp
vec2[i - 1] <- Ftmp
vec3[i - 1] <- den
}
exist_tab <- exist_tab_tmp + vec1 + vec2 - vec3
## Likelihood for New Table
new_tab_tmp <- log((gamma_c / (nobs - 1 + gamma_c)) + const)
num <- 0
den <- 0
for (j in 1:nfld) {
maxS <- Cif[target, j] - 1
if (maxS >= 0) {
for (s in 0:maxS) {
num <- num + log(Cif[target, j] + gamp - 1 - s + const)
}
}
maxS <- Nf[j] - Cif[target, j] - 1
if (maxS >= 0) {
for (s in 0:maxS) {
num <- num + log(Nf[j] - Cif[target, j] + gamp - 1 - s + const)
}
}
maxS <- Nf[j] - 1
if (maxS >= 0) {
for (s in 0:maxS) {
den <- den + log(Nf[j] + 2 * gamp - 1 - s + const)
}
}
}
new_tab <- num - den + new_tab_tmp
# Likelihood to Probability
ptab <- c(exist_tab, new_tab)
mintab <- min(ptab)
exptab <- exp(ptab - mintab)
ptab <- exptab / sum(exptab)
ptab <- round(ptab + const, digits = -log10(const))
sampled_value <- rmultinom(1, 1, ptab)
newclass01 <- c(0, sampled_value, 0)
delpos <- length(newclass01) - 1
if (newclass01[delpos] == 0) {
# Student s belongs an other class
cls01[target, ] <- newclass01[-delpos]
cls[target] <- which.max(cls01[target, ])
} else {
# Student s belongs to a new class
ncls <- ncls + 1
new_class <- which.max(newclass01)
cls01_left <- cls01[, 1:(ncls - 2)]
cls01_right <- cls01[, (ncls - 1)]
zeros <- rep(0, nrow(cls01))
cls01 <- cbind(cls01_left, zeros, cls01_right)
cls01[target, ] <- newclass01
cls <- cls01 %*% (1:ncls)
Ccf <- t(cls01) %*% tmp$U %*% fld01
Fcf <- t(cls01) %*% (tmp$Z * (1 - tmp$U)) %*% fld01
Ncf <- Ccf + Fcf
}
# Update Number of Correct/Incorrect response
Nc <- colSums(cls01)
Ccf[cls[target], ] <- Ccf[cls[target], ] + Cif[target, ]
Fcf[cls[target], ] <- Fcf[cls[target], ] + Fif[target, ]
Ncf[cls[target], ] <- Ccf[cls[target], ] + Fcf[cls[target], ]
}
}
# Gibbs sampler j
oldField <- field # J x C
Cjc <- t(tmp$U) %*% cls01
Fjc <- t(tmp$Z * (1 - tmp$U)) %*% cls01
jRand <- sample(1:testlength, testlength, replace = F)
for (loop in 1:testlength) {
target <- jRand[loop]
# delete selected item
Nf[field[target]] <- Nf[field[target]] - 1
Ccf[, field[target]] <- Ccf[, field[target]] - t(Cjc[target, ])
Fcf[, field[target]] <- Fcf[, field[target]] - t(Fjc[target, ])
Ncf <- Ccf + Fcf
CcfPlus <- Ccf + matrix(rep(Cjc[target, ], nfld), nrow = ncls, byrow = F)
FcfPlus <- Fcf + matrix(rep(Fjc[target, ], nfld), nrow = ncls, byrow = F)
## ## if the field disappeared...
if (Nf[field[target]] == 0) {
nfld <- nfld - 1
delFld <- field[target]
fld01 <- fld01[, -delFld]
field <- fld01 %*% (1:nfld)
Nf <- Nf[-delFld]
Ncf <- Ncf[, -delFld]
Ccf <- Ccf[, -delFld]
Fcf <- Fcf[, -delFld]
CcfPlus <- CcfPlus[, -delFld]
FcfPlus <- FcfPlus[, -delFld]
}
# Table Choice!
## Likelihood for each field
log_tab_tmp <- log((Nf / (testlength - 1 + gamma_f)) + const)
vec1 <- rep(0, nfld)
vec2 <- rep(0, nfld)
vec3 <- rep(0, nfld)
for (j in 1:nfld) {
Ctmp <- 0
Ftmp <- 0
den <- 0
for (i in 1:ncls) {
ct1 <- CcfPlus[i, j] - Ccf[i, j] - 1
if (ct1 >= 0) {
for (s in 0:ct1) {
Ctmp <- Ctmp + log(CcfPlus[i, j] + gamp - 1 - s + const)
}
}
ct2 <- FcfPlus[i, j] - Fcf[i, j] - 1
if (ct2 >= 0) {
for (s in 0:ct2) {
Ftmp <- Ftmp + log(FcfPlus[i, j] + gamp - 1 - s + const)
}
}
ct3 <- CcfPlus[i, j] + FcfPlus[i, j] - Ccf[i, j] - Fcf[i, j] - 1
if (ct3 >= 0) {
for (s in 0:ct3) {
den <- den + (log(CcfPlus[i, j] + FcfPlus[i, j] + 2 * gamp - 1 - s + const))
}
}
}
vec1[j] <- Ctmp
vec2[j] <- Ftmp
vec3[j] <- den
}
log_tab <- log_tab_tmp + vec1 + vec2 - vec3
minLog <- min(log_tab)
log_exp_tab <- log_tab - minLog
maxLogExp <- max(log_exp_tab)
exptab <- exp(log_exp_tab - maxLogExp)
ptab <- exptab / sum(exptab)
ptab <- round(ptab + const, digits = -log10(const))
selected_fld <- rmultinom(1, 1, ptab)
fld01[target, ] <- selected_fld
field[target] <- which.max(selected_fld)
Nf[field[target]] <- Nf[field[target]] + 1
Ccf[, field[target]] <- Ccf[, field[target]] + t(Cjc[target, ])
Fcf[, field[target]] <- Fcf[, field[target]] + t(Fjc[target, ])
Ncf[, field[target]] <- Ccf[, field[target]] + Fcf[, field[target]]
}
limit_count <- if (sum(abs(oldField - field)) == 0) {
limit_count <- limit_count + 1
} else {
limit_count <- 0
}
if (verbose) {
message(
sprintf(
"\r%-80s",
paste0(
"iter ", iter, " Exact match count of field elements. ",
limit_count, " nfld ", nfld, " ncls ", ncls
)
),
appendLF = FALSE
)
}
if (limit_count == stable_limit || iter == max_iter) {
IRM_FLG <- FALSE
} else {
iter <- iter + 1
}
}
# Pcf update
ret <- Ccf_Fcf_Pcf(tmp$U, tmp$Z, cls01, fld01, gamp, zeroG, fullG, type = 1)
Ccf <- ret$Ccf
Fcf <- ret$Fcf
Ncf <- ret$Ncf
Pcf <- ret$Pcf
# model Fit
llm <- sum(Ccf * log(Pcf + const) + Fcf * log(1 - Pcf + const))
df <- nobs * testlength - ncls * nfld
bic <- -2 * llm - df * log(nobs)
indices <- c(llm, df, bic)
# sort
ret <- Pcf_sort(Pcf, cls01, fld01, ncls, nfld)
Pcf <- ret$Pcf
cls01 <- ret$cls01
fld01 <- ret$fld01
cls <- ret$cls
field <- ret$field
### Pcf Update
Cif <- tmp$U %*% fld01
Fif <- (tmp$Z * (1 - tmp$U)) %*% fld01
ret <- Ccf_Fcf_Pcf(tmp$U, tmp$Z, cls01, fld01, gamp, zeroG, fullG, type = 0)
Ccf <- ret$Ccf
Fcf <- ret$Fcf
Ncf <- ret$Ncf
Pcf <- ret$Pcf
# Reorganizing small-sized classses -------------------------------
zero_position <- which(rowSums(tmp$Z * tmp$U, na.rm = T) == 0)
full_position <- which(rowSums(tmp$Z * tmp$U, na.rm = T) == testlength)
delt <- 0
delrep <- ncls
DelRepFLG <- TRUE
while (DelRepFLG) {
bestfit <- 10^10
Nc <- colSums(cls01)
NcTable <- matrix(Nc)
NcTable <- cbind(NcTable, 1:ncls)
if (zeroG == 1) {
NcTable <- NcTable[-1, ]
}
if (fullG == 1) {
NcTable <- NcTable[-nrow(NcTable), ]
}
minclass <- NcTable[order(NcTable[, 1]), ][1, ]
if (minclass[1] < minSize) {
delt <- delt + 1
if (verbose) {
message(
"The minimum class member count is under the setting value.\n",
"bic ", format(bic, digits = 6), " nclass ", ncls
)
}
} else {
DelRepFLG <- FALSE
break
}
ncls <- ncls - 1
delc <- minclass[2]
Pcf <- Pcf[-delc, ]
EMt <- 1
zeta <- log(rep(1, ncls) / ncls)
EMrepFLG <- TRUE
while (EMrepFLG) {
log_num_Zic <- Cif %*% t(log(Pcf + const)) + Fif %*% t(log(1 - Pcf + const)) + matrix(rep(zeta, nobs), nrow = nobs)
numeZic <- exp(log_num_Zic)
if (zeroG == 1) {
numeZic[, 1] <- rep(0, nobs)
}
if (fullG == 1) {
numeZic[, ncol(numeZic)] <- rep(0, nobs)
}
denomZi <- rowSums(numeZic)
Zic <- numeZic / denomZi
Zic_max_list <- apply(Zic, 1, max)
cls01 <- sign(Zic - Zic_max_list) + 1
zero_scorer <- rep(0, ncls)
zero_scorer[1] <- 1
full_scorer <- rep(0, ncls)
full_scorer[ncls] <- 1
cls01[zero_position, ] <- matrix(rep(zero_scorer, length(zero_position)), nrow = length(zero_position), byrow = T)
cls01[full_position, ] <- matrix(rep(full_scorer, length(full_position)), nrow = length(full_position), byrow = T)
cls <- cls01 %*% (1:ncls)
ret <- Ccf_Fcf_Pcf(tmp$U, tmp$Z, cls01, fld01, gamp, zeroG, fullG, type = 0)
Ccf <- ret$Ccf
Fcf <- ret$Fcf
Ncf <- ret$Ncf
Pcf <- ret$Pcf
llm <- sum(Ccf * log(Pcf + const) + Fcf * log(1 - Pcf + const))
df <- nobs * testlength - ncls * nfld
bic <- -2 * llm - df * log(nobs)
indices <- c(llm, df, bic)
fit <- bic
if (fit < bestfit) {
EMt <- EMt + 1
bestfit <- fit
best_index <- indices
bestPcf <- Pcf
bestclass <- class
} else {
Pcf <- bestPcf
class <- bestclass
EMrepFLG <- FALSE
break
}
if (EMt >= EM_limit) {
stop("The EM algorithm has reached its limit. It may not have converged.")
}
}
}
# Model Fit Finally -----------------------------------------------
ncls <- max(cls)
nfld <- max(field)
cls01 <- matrix(0, ncol = ncls, nrow = nobs)
for (i in 1:nobs) {
cls01[i, cls[i]] <- 1
}
colnames(cls01) <- paste("Class", 1:ncls)
rownames(cls01) <- tmp$ID
fld01 <- matrix(0, nrow = testlength, ncol = nfld)
for (i in 1:testlength) {
fld01[i, field[i]] <- 1
}
colnames(fld01) <- paste("Field", 1:nfld)
rownames(fld01) <- tmp$ItemLabel
ret <- Ccf_Fcf_Pcf(tmp$U, tmp$Z, cls01, fld01, gamp, zeroG, fullG, type = 1)
Ccf <- ret$Ccf
Fcf <- ret$Fcf
Ncf <- ret$Ncf
Pcf <- ret$Pcf
llm <- sum(Ccf * log(Pcf + const) + Fcf * log(1 - Pcf + const))
FitIndices <- TestFit(tmp$U, tmp$Z, llm, ncls * nfld)
# Sort ------------------------------------------------------------
ret <- Pcf_sort(Pcf, cls01, fld01, ncls, nfld)
Pcf <- ret$Pcf
cls01 <- ret$cls01
fld01 <- ret$fld01
cls <- ret$cls
field <- ret$field
# Output ---------------------------------------------------------
pifr <- t(Pcf)
flddist <- colSums(fld01)
clsdist <- colSums(cls01)
TRP <- colSums(pifr * flddist)
ret <- structure(list(
U = U,
testlength = testlength,
msg = "Class",
nobs = nobs,
Nclass = ncls,
Nfield = nfld,
EM_Cycle = EMt,
LFD = flddist,
LCD = clsdist,
FRP = pifr,
TRP = TRP,
FieldEstimated = field,
ClassEstimated = cls,
TestFitIndices = FitIndices
), class = c("exametrika", "IRM"))
return(ret)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.