# R/diagnose.R In rpf: Response Probability Functions

#### Documented in ChenThissen1997crosstabTestmultinomialFitordinal.gammaptw2011.gof.testrpf.1dim.fitrpf.1dim.momentrpf.1dim.residualrpf.1dim.stdresidualrpf.mean.inforpf.mean.info1SitemFitSitemFit1

##' Calculate cell central moments
##'
##' Popular central moments include 2 (variance) and 4 (kurtosis).
##'
##' @param spec list of item models
##' @param params data frame of item parameters, 1 per row
##' @param scores model derived person scores
##' @param m which moment
##' @return moment matrix
##' @docType methods
rpf.1dim.moment <- function(spec, params, scores, m) {
out <- array(dim=c(length(scores), length(spec)))
for (ix in 1:length(spec)) {
i <- spec[[ix]]
prob <- t(rpf.prob(i, params[,ix], scores))  # remove t() TODO
Escore <- apply(prob, 1, function(r) sum(r * 0:(i@outcomes-1)))
grid <- t(array(0:(i@outcomes-1), dim=c(i@outcomes, length(scores))))
out[,ix] <- apply((grid - Escore)^m * prob, 1, sum)
}
out
}

##' Calculate residuals
##'
##' @param spec list of item models
##' @param params data frame of item parameters, 1 per row
##' @param responses persons in rows and items in columns
##' @param scores model derived person scores
##' @return residuals
##' @docType methods
rpf.1dim.residual <- function(spec, params, responses, scores) {
Zscore <- array(dim=c(length(scores), length(spec)))
for (ix in 1:length(spec)) {
i <- spec[[ix]]
prob <- t(rpf.prob(i, params[,ix], scores))  # remove t() TODO
Escore <- apply(prob, 1, function(r) sum(r * 0:(i@outcomes-1)))
data <- responses[,ix]
if (is.ordered(data)) {
data <- unclass(data) - 1
} else if (is.factor(data)) {
stop(paste("Column",ix,"is an unordered factor"))
}
if (length(data) != length(Escore)) stop("Length mismatch")
Zscore[,ix] <- data - Escore
}
Zscore
}

##' Calculate standardized residuals
##'
##' @param spec list of item models
##' @param params data frame of item parameters, 1 per row
##' @param responses persons in rows and items in columns
##' @param scores model derived person scores
##' @return standardized residuals
##' @docType methods
rpf.1dim.stdresidual <- function(spec, params, responses, scores) {
res <- rpf.1dim.residual(spec, params, responses, scores)
variance <- rpf.1dim.moment(spec, params, scores, 2)
res / sqrt(variance)
}

##' Calculate item and person Rasch fit statistics
##'
##' Note: These statistics are only appropriate if all discrimination
##' parameters are fixed equal and items are conditionally independent
##' cope with missing data.
##'
##' Exact distributional properties of these statistics are unknown
##' (Masters & Wright, 1997, p. 112).  For details on the calculation,
##' refer to Wright & Masters (1982, p. 100).
##'
##' The Wilson-Hilferty transformation is biased for less than 25 items.
##' Consider wh.exact=FALSE for less than 25 items.
##'
##' @template detail-group
##'
##' @param spec list of item response models \lifecycle{deprecated}
##' @param params matrix of item parameters, 1 per column \lifecycle{deprecated}
##' @param responses persons in rows and items in columns \lifecycle{deprecated}
##' @param scores model derived person scores \lifecycle{deprecated}
##' @param margin for people 1, for items 2
##' @param wh.exact whether to use the exact Wilson-Hilferty transformation
##' @param group spec, params, data, and scores can be provided in a list instead of as arguments
##' @template detail-group
##'
##' @references Masters, G. N. & Wright, B. D. (1997). The Partial
##' Credit Model. In W. van der Linden & R. K. Kambleton (Eds.),
##' \emph{Handbook of modern item response theory}
##' (pp. 101-121). Springer.
##'
##' Wilson, E. B., & Hilferty, M. M. (1931). The distribution of
##' chi-square. \emph{Proceedings of the National Academy of Sciences of the
##' United States of America,} 17, 684-688.
##'
##' Wright, B. D. & Masters, G. N. (1982). \emph{Rating Scale
##' Analysis.} Chicago: Mesa Press.
##' @family diagnostic
##' @examples
##' data(kct)
##' responses <- kct.people[,paste("V",2:19, sep="")]
##' rownames(responses) <- kct.people$NAME ##' colnames(responses) <- kct.items$NAME
##' scores <- kct.people$MEASURE ##' params <- cbind(1, kct.items$MEASURE, logit(0), logit(1))
##' rownames(params) <- kct.items$NAME ##' items<-list() ##' items[1:18] <- rpf.drm() ##' params[,2] <- -params[,2] ##' rpf.1dim.fit(items, t(params), responses, scores, 2, wh.exact=TRUE) rpf.1dim.fit <- function(spec, params, responses, scores, margin, group=NULL, wh.exact=TRUE) { if (missing(margin)) stop("Which margin?") if (!missing(group)) { spec <- group$spec
params <- group$param responses <- group$data
scores <- group$score[,1] # should not assume first score TODO } if (any(is.na(responses))) warning("Rasch fit statistics should not be used with missing data") # true? TODO if (dim(params)[2] != length(spec)) { stop(paste("spec is length", length(spec), "but param has", dim(params)[2], "columns")) } if (nrow(responses) != length(scores)) { stop(paste("nrow(responses) is", nrow(responses), "but",length(scores), "factor scores")) } if (dim(params)[2] < 25 && wh.exact) { if (missing(wh.exact)) { wh.exact <- FALSE warning("Consider wh.exact=FALSE for less than 25 items") } } if (dim(params)[2] > 25 && !wh.exact) { if (missing(wh.exact)) { wh.exact <- TRUE warning("Consider wh.exact=TRUE for more than 25 items") } } exclude.col <- c() outcomes <- sapply(spec, function(s) s@outcomes) for (ix in 1:dim(responses)[2]) { kat <- sum(table(responses[,ix]) > 0) if (kat != outcomes[ix]) { exclude.col <- c(exclude.col, ix) warning(paste("Excluding item", colnames(responses)[ix], "because outcomes !=", outcomes[ix])) } } if (length(exclude.col)) { responses <- responses[,-exclude.col] spec <- spec[-exclude.col] params <- params[,-exclude.col] outcomes <- outcomes[-exclude.col] } exclude.row <- c() for (ix in 1:dim(responses)[1]) { r1 <- sapply(responses[ix,], unclass) if (any(is.na(r1))) next if (all(r1 == 1) || all(r1 == outcomes)) { exclude.row <- c(exclude.row, ix) warning(paste("Excluding response", rownames(responses)[ix], "because it is a minimum or maximum")) } } if (length(exclude.row)) { responses <- responses[-exclude.row,] scores <- scores[-exclude.row] } na.rm=TRUE r.z <- rpf.1dim.stdresidual(spec, params, responses, scores) r.var <- rpf.1dim.moment(spec, params, scores,2) r.var[is.na(r.z)] <- NA r.k <- rpf.1dim.moment(spec, params, scores,4) r.k[is.na(r.z)] <- NA outfit.var <- r.var outfit.var[r.var^2 < 1e-5] <- sqrt(1e-5) outfit.n <- apply(r.var, margin, function(l) sum(!is.na(l))) outfit.sd <- sqrt(apply(r.k / outfit.var^2, margin, sum, na.rm=na.rm) / outfit.n^2 - 1/outfit.n) outfit.sd[outfit.sd > 1.4142] <- 1.4142 outfit.fudge <- outfit.sd/3 infit.sd <- sqrt(apply(r.k - r.var^2, margin, sum, na.rm=na.rm)/ apply(r.var, margin, sum, na.rm=na.rm)^2) infit.sd[infit.sd > 1.4142] <- 1.4142 infit.fudge <- infit.sd/3 if (!wh.exact) { infit.fudge <- 0 outfit.fudge <- 0 } outfit <- apply(r.z^2, margin, sum, na.rm=na.rm)/ apply(r.z, margin, function (l) sum(!is.na(l))) outfit.z <- (outfit^(1/3) - 1)*(3/outfit.sd) + outfit.fudge infit <- apply(r.z^2 * r.var, margin, sum, na.rm=na.rm)/ apply(r.var, margin, sum, na.rm=na.rm) infit.z <- (infit^(1/3) - 1)*(3/infit.sd) + infit.fudge df <- data.frame(n=outfit.n, infit, infit.z, outfit, outfit.z) if (margin == 2) { df$name <- colnames(params)
} else {
df$name <- rownames(responses) } df } ##' Find the point where an item provides mean maximum information ##' ##' \lifecycle{experimental} ##' ##' @param spec an item spec ##' @param iparam an item parameter vector ##' @param grain the step size for numerical integration (optional) rpf.mean.info1 <- function(spec, iparam, grain=.1) { range <- 9 dim <- spec@factors if (dim != 1) stop("Not implemented") grid <- seq(-range, range, grain) info <- rpf.info(spec, iparam, grid) sum(info * grid) / sum(info) } ##' Find the point where an item provides mean maximum information ##' ##' \lifecycle{experimental} ##' This is a point estimate of the mean difficulty of items that do ##' not offer easily interpretable parameters such as the Generalized ##' PCM. Since the information curve may not be unimodal, this ##' function integrates across the latent space. ##' ##' @param spec list of item specs ##' @param param list or matrix of item parameters ##' @param grain the step size for numerical integration (optional) rpf.mean.info <- function(spec, param, grain=.1) { ret <- list() for (ix in 1:length(spec)) { iparam <- c() if (is.list(param)) { iparam <- param[[ix]] } else { iparam <- param[ix,] } ret[[ix]] <- rpf.mean.info1(spec[[ix]], iparam, grain) } ret } # copied from mirt collapseCells <- function(On, En, mincell = 1){ drop <- which(rowSums(is.na(En)) > 0) En[is.na(En)] <- 0 #collapse known upper and lower sparce cells if(length(drop) > 0L){ up <- drop[1L]:drop[length(drop)/2] low <- drop[length(drop)/2 + 1L]:drop[length(drop)] En[max(up)+1, ] <- colSums(En[c(up, max(up)+1), , drop = FALSE]) On[max(up)+1, ] <- colSums(On[c(up, max(up)+1), , drop = FALSE]) En[min(low)-1, ] <- colSums(En[c(low, min(low)-1), , drop = FALSE]) On[min(low)-1, ] <- colSums(On[c(low, min(low)-1), , drop = FALSE]) En[c(up, low), ] <- On[c(up, low), ] <- NA En <- na.omit(En) On <- na.omit(On) } #collapse accross if(ncol(En) > 2L){ for(j in 1L:(ncol(On)-1L)){ L <- En < mincell sel <- L[,j] if(!any(sel)) next On[sel, j+1L] <- On[sel, j] + On[sel, j+1L] En[sel, j+1L] <- En[sel, j] + En[sel, j+1L] On[sel, j] <- En[sel, j] <- NA } sel <- L[,j+1L] sel[rowSums(is.na(En[, 1L:j])) == (ncol(En)-1L)] <- FALSE put <- apply(En[sel, 1L:j, drop=FALSE], 1, function(x) max(which(!is.na(x)))) put2 <- which(sel) for(k in 1L:length(put)){ En[put2[k], put[k]] <- En[put2[k], put[k]] + En[put2[k], j+1L] En[put2[k], j+1L] <- On[put2[k], j+1L] <- NA } } L <- En < mincell L[is.na(L)] <- FALSE while(any(L)){ drop <- c() for(j in 1L:(nrow(On)-1L)){ if(any(L[j,])) { On[j+1L, L[j,]] <- On[j+1L, L[j,]] + On[j, L[j,]] En[j+1L, L[j,]] <- En[j+1L, L[j,]] + En[j, L[j,]] drop <- c(drop, j) break } } for(j in nrow(On):2L){ if(any(L[j,])) { On[j-1L, L[j,]] <- On[j-1L, L[j,]] + On[j, L[j,]] En[j-1L, L[j,]] <- En[j-1L, L[j,]] + En[j, L[j,]] drop <- c(drop, j) break } } if(nrow(On) > 4L){ for(j in 2L:(nrow(On)-1L)){ if(any(L[j,])){ On[j+1L, L[j,]] <- On[j+1L, L[j,]] + On[j, L[j,]] En[j+1L, L[j,]] <- En[j+1L, L[j,]] + En[j, L[j,]] drop <- c(drop, j) break } } } #drop if(!is.null(drop)){ En <- En[-drop, ] On <- On[-drop, ] } L <- En < mincell L[is.na(L)] <- FALSE } return(list(O=On, E=En)) } SitemFit1Internal <- function(out) { observed <- out$orig.observed
expected <- out$orig.expected mask <- apply(observed, 1, sum) != 0 observed = observed[mask,,drop=FALSE] expected = expected[mask,,drop=FALSE] if (!length(observed)) { out$statistic <- NA
out$pval <- NA return(out) } method <- out$method
if (method == "pearson") {
if (out$alt) { kc <- collapseCells(observed, expected) } else { kc <- .Call('_rpf_collapse', observed, expected, 1.0) } out$observed <- observed <- kc$O out$expected <- expected <- kc$E mask <- !is.na(expected) & expected!=0 out$statistic <- sum((observed[mask] - expected[mask])^2 / expected[mask])
out$df <- nrow(observed) * (ncol(observed) - 1) out$df <- out$df - out$free - sum(is.na(expected));
out$df[out$df < 1] <- 1
out$pval <- pchisq(out$statistic, out$df, lower.tail=FALSE, log.p=log) } else if (method == "rms") { pval <- crosstabTest(observed, expected) out$observed <- observed
out$expected <- expected if (log) { out$pval <- log(pval)
} else {
out$pval <- pval } } else { stop(paste("Method", method, "not recognized")) } out } ##' Compute the S fit statistic for 1 item ##' ##' Implements the Kang & Chen (2007) polytomous extension to ##' S statistic of Orlando & Thissen (2000). Rows with ##' missing data are ignored, but see the \code{omit} option. ##' ##' This statistic is good at finding a small number of misfitting ##' items among a large number of well fitting items. However, be ##' aware that misfitting items can cause other items to misfit. ##' ##' Observed tables cannot be computed when data is ##' missing. Therefore, you can optionally omit items with the ##' greatest number of responses missing relative to the item of ##' interest. ##' ##' Pearson is slightly more powerful than RMS in most cases I ##' examined. ##' ##' Setting \code{alt} to \code{TRUE} causes the tables to match ##' published articles. However, the default setting of \code{FALSE} ##' probably provides slightly more power when there are less than 10 ##' items. ##' ##' The name of the test, "S", probably stands for sum-score. ##' ##' @template detail-group ##' ##' @template arg-grp ##' @param item the item of interest ##' @param free the number of free parameters involved in estimating the item (to adjust the df) ##' @template arg-dots ##' @param method whether to use a pearson or rms test ##' @param log whether to return p-values in log units ##' @param qwidth \lifecycle{deprecated} ##' @param qpoints \lifecycle{deprecated} ##' @param alt whether to include the item of interest in the denominator ##' @param omit number of items to omit or a character vector with the names of the items to omit when calculating the observed and expected sum-score tables ##' @param .twotier whether to enable the two-tier optimization ##' @family diagnostic ##' @references Kang, T. and Chen, T. T. (2007). An investigation of ##' the performance of the generalized S-Chisq item-fit index for ##' polytomous IRT models. ACT Research Report Series. ##' ##' Orlando, M. and Thissen, D. (2000). Likelihood-Based ##' Item-Fit Indices for Dichotomous Item Response Theory Models. ##' \emph{Applied Psychological Measurement, 24}(1), 50-64. SitemFit1 <- function(grp, item, free=0, ..., method="pearson", log=TRUE, qwidth=6, qpoints=49L, alt=FALSE, omit=0L, .twotier=TRUE) { if (length(list(...)) > 0) { stop(paste("Remaining parameters must be passed by name", deparse(list(...)))) } if (!missing(qwidth) || !missing(qpoints)) complainAboutQuadSpec() spec <- grp$spec
c.spec <- lapply(spec, function(m) {
if (length(m@spec)==0) { stop("Item model",m,"is not implemented") }
else { m@spec }
})

param <- grp$param itemIndex <- which(item == colnames(param)) mask <- rep(TRUE, ncol(param)) if (!alt) mask[itemIndex] <- FALSE omitted <- NULL if (is.null(omit)) { # OK } else if (is.numeric(omit)) { omitted <- bestToOmit(grp, omit, item) } else if (is.character(omit)) { omitted <- omit } else { stop(paste("Not clear how to interpret omit =", omit)) } mask[match(omitted, colnames(grp$param))] <- FALSE
observed <-iobss$table max.param <- max(vapply(spec, rpf.numParam, 0)) if (nrow(param) < max.param) { stop(paste("param matrix must have", max.param ,"rows")) } Eproportion <- ot2000md(grp, itemIndex, alt, mask, .twotier) if (nrow(Eproportion) != nrow(observed)) { print(Eproportion) stop(paste("Expecting", nrow(observed), "rows in expected matrix")) } Escale <- matrix(apply(observed, 1, sum), nrow=nrow(Eproportion), ncol=ncol(Eproportion)) expected <- Eproportion * Escale names(dimnames(observed)) <- c("sumScore", "outcome") dimnames(expected) <- dimnames(observed) out <- list(orig.observed=observed, orig.expected = expected, log=log, method=method, n=iobss$n, free=free, alt=alt, omitted=omitted)

out <- SitemFit1Internal(out)
out
}

ot2000md <- function(grp, item, alt=FALSE, mask, .twotier) {
.Call('_rpf_ot2000', grp, item, alt, mask, .twotier)
}

##' Compute the S fit statistic for a set of items
##'
##' Runs \code{\link{SitemFit1}} for every item and accumulates
##' the results.
##'
##' @template detail-group
##'
##' @template arg-grp
##' @template arg-dots
##' @param method whether to use a pearson or rms test
##' @param log whether to return p-values in log units
##' @param qwidth \lifecycle{deprecated}
##' @param qpoints \lifecycle{deprecated}
##' @param alt whether to include the item of interest in the denominator
##' @param omit number of items to omit (a single number) or a list of the length the number of items
##' @param .twotier whether to enable the two-tier optimization
##' @param .parallel whether to take advantage of multiple CPUs (default TRUE)
##' @return
##' a list of output from \code{\link{SitemFit1}}
##' @family diagnostic
##' @examples
##' grp <- list(spec=list())
##' grp$spec[1:20] <- list(rpf.grm()) ##' grp$param <- sapply(grp$spec, rpf.rparam) ##' colnames(grp$param) <- paste("i", 1:20, sep="")
##' grp$mean <- 0 ##' grp$cov <- diag(1)
##' grp$free <- grp$param != 0
##' grp$data <- rpf.sample(500, grp=grp) ##' SitemFit(grp) SitemFit <- function(grp, ..., method="pearson", log=TRUE, qwidth=6, qpoints=49L, alt=FALSE, omit=0L, .twotier=TRUE, .parallel=TRUE) { if (length(list(...)) > 0) { stop(paste("Remaining parameters must be passed by name", deparse(list(...)))) } if (!missing(qwidth) || !missing(qpoints)) complainAboutQuadSpec() spec <- grp$spec
param <- grp$param if (ncol(param) != length(spec)) stop("Dim mismatch between param and spec") if (is.null(colnames(param))) stop("grp$param must have column names")

.la <- ifelse(.parallel, mclapply, lapply)
got <- .la(1:length(spec), function(interest) {
free <- 0
if (!is.null(grp$free)) free <- sum(grp$free[,interest])
itemname <- colnames(param)[interest]
omit1 <- omit
if (is.list(omit)) {
omit1 <- omit[[interest]]
}
ot.out <- SitemFit1(grp, itemname, free, method=method, log=log,
alt=alt, omit=omit1, .twotier=.twotier)
ot.out
})
lapply(got, function(d1) {
if (inherits(d1, "try-error")) stop(d1)
})
names(got) <- colnames(param)
class(got) <- "summary.SitemFit"
got
}

"+.summary.SitemFit" <- function(e1, e2) {
e2name <- deparse(substitute(e2))
if (!inherits(e2, "summary.SitemFit")) {
stop("Don't know how to add ", e2name, " to a SitemFit",
call. = FALSE)
}
if (length(e1) != length(e2)) {
stop("Cannot combine two groups with a different number of items")
}
if (any(names(e1) != names(e2))) {
stop("Cannot combine two groups with a different items")
}
if (!all(mapply(function(i1,i2){ isTRUE(all.equal(i1$omitted, i2$omitted)) }, e1, e2))) {
stop("Cannot combine two groups with different omitted items")
}

got <- mapply(function(i1, i2){
ii <- list(orig.observed = i1$orig.observed + i2$orig.observed,
orig.expected = i1$orig.expected + i2$orig.expected,
log = i1$log, method = i1$method,
n = i1$n + i2$n,
free = max(i1$free, i2$free),
alt = i1$alt) SitemFit1Internal(ii) }, e1, e2, SIMPLIFY=FALSE) class(got) <- "summary.SitemFit" got } print.summary.SitemFit <- function(x,...) { cat("Orlando & Thissen (2000) sum-score based item fit test\n") cat(" Magnitudes larger than abs(log(.01))=4.6 are significant at the p=.01 level\n\n") width <- max(sapply(names(x), nchar)) fmt <- paste("%", width, "s : n = %4d, ", sep="") for (ix in 1:length(x)) { report1 <- x[[ix]] msg <- sprintf(fmt, names(x)[ix], report1$n)
stat <- round(report1$statistic, 2) if (report1$method == "pearson") {
msg <- paste(msg, sprintf("S-X2(%3d) = %6.2f, ", report1$df, stat), sep="") } else if (report1$method == "rms") {
msg <- paste(msg, "MS=", stat, ", ", sep="")
} else {
stop(report1$method) } if (report1$log) {
msg <- paste(msg, "log(p) = ", round(report1$pval, 2), sep="") } else { msg <- paste(msg, "p = ", round(report1$pval, 4), sep="")
}
msg <- paste(msg, "\n", sep="")
cat(msg)
}
}

##' Compute the ordinal gamma association statistic
##'
##' @param mat a cross tabulation matrix
##' @references
##' Agresti, A. (1990). Categorical data analysis. New York: Wiley.
##' @examples
##' # Example data from Agresti (1990, p. 21)
##' jobsat <- matrix(c(20,22,13,7,24,38,28,18,80,104,81,54,82,125,113,92), nrow=4, ncol=4)
##' ordinal.gamma(jobsat)
ordinal.gamma <- function(mat) .Call(_rpf_gamma_cor, mat)

# root mean squared statistic (sqrt omitted)
ms <- function(observed, expected, draws) {
draws * sum((observed - expected)^2)
}

P.cdf.fn <- function(x, g.var, t) {
got <- sapply(t, function (t1) {
n <- length(g.var)
num <- exp(1-t1) * exp(1i * t1 * sqrt(n))
den <- pi * (t1 - 1/(1-1i*sqrt(n)))
pterm <- prod(sqrt(1 - 2*(t1-1)*g.var/x + 2i*t1*g.var*sqrt(n)/x))
Im(num / (den * pterm))
})
#  print(cbind(t,got))
got
}

##' Compute the P value that the observed and expected tables come from the same distribution
##'
##' \lifecycle{experimental}
##' This test is an alternative to Pearson's X^2
##' goodness-of-fit test.  In contrast to Pearson's X^2, no ad hoc cell
##' collapsing is needed to avoid an inflated false positive rate
##' in situations of sparse cell frequences.
##' The statistic rapidly converges to the Monte-Carlo estimate
##' as the number of draws increases.
##'
##' @param observed observed matrix
##' @param expected expected matrix
##' @return The P value indicating whether the two tables come from
##' the same distribution. For example, a significant result (P <
##' alpha level) rejects the hypothesis that the two matrices are from
##' the same distribution.
##' @references Perkins, W., Tygert, M., & Ward, R. (2011). Computing
##' the confidence levels for a root-mean-square test of
##' goodness-of-fit. \emph{Applied Mathematics and Computations,
##' 217}(22), 9072-9084.
##' @examples
##' draws <- 17
##' observed <- matrix(c(.294, .176, .118, .411), nrow=2) * draws
##' expected <- matrix(c(.235, .235, .176, .353), nrow=2) * draws
##' ptw2011.gof.test(observed, expected)  # not signficiant

ptw2011.gof.test <- function(observed, expected) {
orig.draws <- sum(observed)
oeDiff <- abs(sum(expected) - orig.draws)
if (is.na(oeDiff) || oeDiff > 1e-6) {
warning(paste("Total observed - total expected", oeDiff))
return(NA)
}
if (any(c(expected)==0)) {
zeros <- sum(c(expected)==0)
warning(paste("There are", zeros, "zeros in the expected distribution.",
"Did you swap the observed and expected arguments"))
return(NA)
}
observed <- observed / orig.draws
expected <- expected / orig.draws

X <- ms(observed, expected, orig.draws)
if (X == 0) return(1)

n <- length(c(observed))
D <- diag(1/c(expected))
P <- matrix(-1/n, n,n)
diag(P) <- 1 - 1/n
B <- P %*% D %*% P
g.var <- 1 / eigen(B, only.values=TRUE)$values[-n] # Eqn 8 needs n variances, but matrix B (Eqn 6) only has n-1 non-zero # eigenvalues. Perhaps n-1 degrees of freedom? # debugging: # plot(function(t) P.cdf.fn(X, g.var, t), 0, 40) # 310 points should be good enough for 500 bins # Perkins, Tygert, Ward (2011, p. 10) # If integration tolerance is too large, non-convergence can result, # http://r.789695.n4.nabble.com/Need-help-to-understand-integrate-function-td2322093.html got <- try(integrate(function(t) P.cdf.fn(X, g.var, t), 0, 40, subdivisions=310L, rel.tol=1e-10), silent=TRUE) if (inherits(got, "try-error")) return(NA) p.value <- 1 - got$value
smallest <- 6.3e-16  # approx exp(-35)
if (p.value < smallest) p.value <- smallest
p.value
}

CT1997Internal1 <- function(info, method) {
observed <- info$orig.observed expected <- info$orig.expected

s <- ordinal.gamma(observed) - ordinal.gamma(expected)
if (!is.finite(s) || is.na(s) || s==0) s <- 1
info <- c(info, sign=sign(s), gamma=s)

if (method == "pearson") {
kc <- .Call('_rpf_collapse', observed, expected, 1.0)
observed <- kc$O expected <- kc$E
df <- prod(dim(observed)-1) - kc$collapsed if (df < 1L) df <- 1L info <- c(info, list(statistic=x2, df=df, observed=observed, expected=expected)) } else if (method == "lr") { mask <- observed > 0 g2 <- -2 * sum(observed[mask] * log(expected[mask] / observed[mask])) df <- prod(dim(observed)-1) info <- c(info, statistic=g2, df=df) } info } CT1997Internal2 <- function(inames, detail) { cnames <- inames[-length(inames)] gamma <- matrix(NA, length(inames), length(cnames)) dimnames(gamma) <- list(inames, cnames) raw <- matrix(NA, length(inames), length(cnames)) dimnames(raw) <- list(inames, cnames) std <- matrix(NA, length(inames), length(cnames)) dimnames(std) <- list(inames, cnames) pval <- matrix(NA, length(inames), length(cnames)) dimnames(pval) <- list(inames, cnames) px <- 1L for (iter1 in 2:length(inames)) { for (iter2 in 1:(iter1-1)) { d1 <- detail[[px]] gamma[iter1, iter2] <- d1$gamma
s <- d1$sign stat <- d1$statistic
df <- d1$df raw[iter1, iter2] <- stat std[iter1, iter2] <- s * abs((stat - df)/sqrt(2*df)) pval[iter1, iter2] <- s * -pchisq(stat, df, lower.tail=FALSE, log.p=TRUE) px <- px + 1L } } retobj <- list(pval=pval[-1,,drop=FALSE], std=std[-1,,drop=FALSE], raw=raw[-1,,drop=FALSE], gamma=gamma[-1,,drop=FALSE], detail=detail) class(retobj) <- "summary.ChenThissen1997" retobj } tableWithWeights <- function(colpair, weights) { if (length(colpair) != 2) stop("Not a pair") l1 <- levels(colpair[[1]]) l2 <- levels(colpair[[2]]) if (1) { result <- .Call('_rpf_fast_tableWithWeights', colpair[[1]], colpair[[2]], weights) } else { result <- matrix(0.0, length(l1), length(l2)) if (nrow(colpair)) for (rx in 1:nrow(colpair)) { row <- colpair[rx,] ind <- sapply(row, unclass) w <- 1 if (length(weights)) w <- weights[rx] result[ind[1], ind[2]] <- result[ind[1], ind[2]] + w } } dimnames(result) <- list(l1, l2) names(dimnames(result)) <- colnames(colpair) result } complainAboutQuadSpec <- function() { stop("Specify qwidth and qpoints as part of the group; For example, grp$qpoints <- 31L; grp$qwidth=4") } ##' Computes local dependence indices for all pairs of items ##' ##' Item Factor Analysis makes two assumptions: (1) that the latent ##' distribution is reasonably approximated by the multivariate Normal ##' and (2) that items are conditionally independent. This test ##' examines the second assumption. The presence of locally dependent ##' items can inflate the precision of estimates causing a test to ##' seem more accurate than it really is. ##' ##' Statically significant entries suggest that the item pair has ##' local dependence. Since log(.01)=-4.6, an absolute magitude of 5 ##' is a reasonable cut-off. Positive entries indicate that the two ##' item residuals are more correlated than expected. These items may share an ##' unaccounted for latent dimension. Consider a redesign of the items ##' or the use of testlets for scoring. Negative entries indicate that ##' the two item residuals are less correlated than expected. ##' ##' @template detail-group ##' ##' @template arg-grp ##' @template arg-dots ##' @param data data \lifecycle{deprecated} ##' @param inames a subset of items to examine ##' @param qwidth \lifecycle{deprecated} ##' @param qpoints \lifecycle{deprecated} ##' @param method method to use to calculate P values. The default is the ##' Pearson X^2 statistic. Use "lr" for the similar likelihood ratio statistic. ##' @param .twotier whether to enable the two-tier optimization ##' @param .parallel whether to take advantage of multiple CPUs (default TRUE) ##' @return a list with raw, pval and detail. The pval matrix is a ##' lower triangular matrix of log P values with the sign ##' determined by relative association between the observed and ##' expected tables (see \code{\link{ordinal.gamma}}) ##' @aliases chen.thissen.1997 ##' @references Chen, W.-H. & Thissen, D. (1997). Local dependence ##' indexes for item pairs using Item Response Theory. \emph{Journal ##' of Educational and Behavioral Statistics, 22}(3), 265-289. ##' ##' Thissen, D., Steinberg, L., & Mooney, J. A. (1989). Trace lines for testlets: A use ##' of multiple-categorical-response models. \emph{Journal of Educational Measurement, ##' 26} (3), 247--260. ##' ##' Wainer, H. & Kiely, G. L. (1987). Item clusters and computerized ##' adaptive testing: A case for testlets. \emph{Journal of ##' Educational measurement, 24}(3), 185--201. ##' @family diagnostic ##' @seealso \href{https://github.com/jpritikin/ifaTools}{ifaTools} ChenThissen1997 <- function(grp, ..., data=NULL, inames=NULL, qwidth=6, qpoints=49, method="pearson", .twotier=TRUE, .parallel=TRUE) { if (length(list(...)) > 0) { stop(paste("Remaining parameters must be passed by name", deparse(list(...)))) } if (is.null(colnames(grp$param))) stop("Item parameter columns must be named")

if (missing(data)) {
data <- grp$data } if (!missing(qwidth) || !missing(qpoints)) complainAboutQuadSpec() if (method != "pearson" && method != "lr") stop(paste("Unknown method", method)) if (missing(inames)) { inames <- colnames(grp$param)
}
if (length(inames) < 2) stop("At least 2 items are required")

spec <- grp$spec if (length(spec) < dim(grp$param)[2]) {
if (dim(grp$param)[2] %% length(spec) != 0) stop("Length of spec must match # of items") rep <- dim(grp$param)[2] %/% length(spec)
while (rep > 1) {
spec <- c(spec, grp$spec) rep <- rep - 1 } } if (!is.data.frame(data)) { data <- as.data.frame(data) #safe? TODO } dataMap <- match(colnames(grp$param), colnames(data))

items <- match(inames, colnames(grp$param)) # If we move this whole loop into C then we can avoid # repeated set up of the quadrature pairs <- list() for (iter1 in 2:length(items)) { for (iter2 in 1:(iter1-1)) { pairs[[ 1+length(pairs) ]] <- c(iter1, iter2) } } .la <- ifelse(.parallel, mclapply, lapply) detail <- .la(pairs, function(pair) { iter1 <- pair[1] iter2 <- pair[2] i1 <- items[iter1] i2 <- items[iter2] obpair <- data[,dataMap[c(i1,i2)]] rowWeight <- c() if (!is.null(grp[['weightColumn']])) { rowWeight <- data[[ grp[['weightColumn']] ]] } if (!is.null(grp[['freqColumn']])) { fc <- data[[ grp[['freqColumn']] ]] if (length(rowWeight) == 0) rowWeight <- fc else rowWeight <- rowWeight * fc } observed <- tableWithWeights(obpair, rowWeight) N <- sum(observed) expected <- N * pairwiseExpected(grp, c(i1, i2), .twotier) if (any(dim(observed) != dim(expected))) { if (dim(observed)[1] != dim(expected)[1]) { bad <- i1 margin <- 1 } else { bad <- i2 margin <- 2 } Eoutcomes <- dim(expected)[margin] lev <- dimnames(observed)[[margin]] stop(paste(colnames(grp$param)[bad], " has ", Eoutcomes,
" outcomes in the model but the data has ", length(lev), " (",
paste(lev, collapse=", "),")", sep=""))
}
dimnames(expected) <- dimnames(observed)
info <- list(orig.observed=observed, orig.expected=expected)
info <- CT1997Internal1(info, method)
info
})

lapply(detail, function(d1) {
if (inherits(d1, "try-error")) stop(d1)
})

names(detail) <- sapply(pairs, function(pair) {
paste(inames[pair[1]], inames[pair[2]], sep=":")
})

retobj <- CT1997Internal2(inames, detail)
retobj$inames <- inames retobj$method <- method
retobj
}

# deprecated name
chen.thissen.1997 <- ChenThissen1997

"+.summary.ChenThissen1997" <- function(e1, e2) {
e2name <- deparse(substitute(e2))
if (!inherits(e2, "summary.ChenThissen1997")) {
stop("Don't know how to add ", e2name, " to a ChenThissen1997",
call. = FALSE)
}
if (length(e1$detail) != length(e2$detail)) {
stop("Cannot combine two groups with a different number of items")
}
if (any(names(e1$detail) != names(e2$detail))) {
stop("Cannot combine two groups with a different items")
}
method <- e1$method detail <- mapply(function(i1, i2) { ii <- list(orig.observed = i1$orig.observed + i2$orig.observed, orig.expected = i1$orig.expected + i2$orig.expected) ii <- CT1997Internal1(ii, method) }, e1$detail, e2$detail, SIMPLIFY=FALSE) retobj <- CT1997Internal2(e1$inames, detail)
retobj$inames <- e1$inames
retobj$method <- method retobj } print.summary.ChenThissen1997 <- function(x,...) { cat("Chen & Thissen (1997) local dependence test\n") cat(" Magnitudes larger than abs(log(.01))=4.6 are significant at the p=.01 level\n") cat(" A positive (negative) sign indicates more (less) observed correlation than expected\n\n") print(round(x$pval,2))
}

##' Monte-Carlo test for cross-tabulation tables
##'
##' \lifecycle{experimental}
##' This is for developers.
##'
##' @param ob observed table
##' @param ex expected table
##' @param trials number of Monte-Carlo trials
crosstabTest <- function(ob, ex, trials) {
if (missing(trials)) trials <- 10000
.Call('_rpf_crosstabTest_cpp', ob, ex, trials)
}

pairwiseExpected <- function(grp, items, .twotier=FALSE) {
.Call('_rpf_pairwiseExpected_cpp', grp, items - 1L, .twotier)
}

CaiHansen2012 <- function(grp, method, .twotier = FALSE) {
.Call('_rpf_CaiHansen2012_cpp', grp, method, .twotier)
}

##' Multinomial fit test
##'
##' For degrees of freedom, we use the number of observed statistics
##' (incorrect) instead of the number of possible response patterns
##' (correct) (see Bock, Giibons, & Muraki, 1998, p. 265). This is not
##' a huge problem because this test is becomes poorly calibrated when
##' the multinomial table is sparse. For more accurate p-values, you
##' can conduct a Monte-Carlo simulation study (see examples).
##'
##' Rows with missing data are ignored.
##'
##' The full information test is described in Bartholomew & Tzamourani
##' (1999, Section 3).
##'
##' For CFI and TLI, you must provide an \code{independenceGrp}.
##'
##' @template detail-group
##'
##' @template arg-grp
##' @param independenceGrp the independence group
##' @template arg-dots
##' @param method lr (default) or pearson
##' @param log whether to report p-value in log units
##' @param .twotier whether to use the two-tier optimization (default TRUE)
##' @references Bartholomew, D. J., & Tzamourani, P. (1999). The
##' goodness-of-fit of latent trait models in attitude
##' measurement. \emph{Sociological Methods and Research, 27}(4), 525-546.
##'
##' Bock, R. D., Gibbons, R., & Muraki, E. (1988). Full-information
##' item factor analysis. \emph{Applied Psychological Measurement, 12}(3),
##' 261-280.
##' @family diagnostic
##' @examples
##' # Create an example IFA group
##' grp <- list(spec=list())
##' grp$spec[1:10] <- rpf.grm() ##' grp$param <- sapply(grp$spec, rpf.rparam) ##' colnames(grp$param) <- paste("i", 1:10, sep="")
##' grp$mean <- 0 ##' grp$cov <- diag(1)
##' grp$uniqueFree <- sum(grp$param != 0)
##' grp$data <- rpf.sample(1000, grp=grp) ##' ##' # Monte-Carlo simulation study ##' mcReps <- 3 # increase this to 10,000 or so ##' stat <- rep(NA, mcReps) ##' for (rx in 1:mcReps) { ##' t1 <- grp ##' t1$data <- rpf.sample(grp=grp)
##'    stat[rx] <- multinomialFit(t1)$statistic ##' } ##' sum(multinomialFit(grp)$statistic > stat)/mcReps   # better p-value

multinomialFit <- function(grp, independenceGrp, ..., method="lr", log=TRUE, .twotier=TRUE) {
if (length(list(...)) > 0) {
stop(paste("Remaining parameters must be passed by name", deparse(list(...))))
}
todo <- list(grp)
if (!missing(independenceGrp)) todo <- list(grp, independenceGrp)
todo <- lapply(todo, function(gx) {
if (is.null(gx$weightColumn)) { wc <- "freq" gx$data <- compressDataFrame(gx$data, wc, .asNumeric=TRUE) gx$weightColumn <- wc
gx$observedStats <- nrow(gx$data)
}
if (is.null(gx$uniqueFree)) { warning("Number of free parameters not available; assuming 0") gx$uniqueFree <- 0
}
if (is.null(gx$observedStats)) { warning("Number of observed statistics unknown; assuming the number of possible response patterns") gx$observedStats <- prod(sapply(gx$spec, function(sp) sp$outcomes))
}
gx
})
got <- lapply(todo, CaiHansen2012, method, .twotier)
stat <- got[[1]]$stat df <- todo[[1]]$observedStats - todo[[1]]$uniqueFree out <- list() out <- c(out, list(statistic=stat, df=df)) out$pval <- pchisq(stat, out$df, lower.tail=FALSE, log.p=log) out$log <- log
out$method <- method out$n <- got[[1]]$n out$omitted <- grp$omitted if (length(todo) == 1) { fi <- computeFitStatistics(stat, df, out$n, NA, NA)
} else {
fi <- computeFitStatistics(stat, df, out$n, got[[2]]$stat, todo[[2]]$observedStats - todo[[2]]$uniqueFree)
}
for (k in names(fi)) out[[k]] <- fi[[k]]
class(out) <- "summary.multinomialFit"
out
}

print.summary.multinomialFit <- function(x,...) {
cat("Full information multinomial fit test\n")
part1 <- paste("n = ", x$n, ", ", x$method, "(", x$df, ") = ", round(x$statistic, 2), sep="")
if (x$log) { part2 <- paste("log(p) = ", round(x$pval,2), sep="")
} else {
part2 <- paste("p = ", round(x$pval,4), sep="") } cat(paste(part1, ", ", part2, sep=""), fill=TRUE) catFitStatistics(x) if (!is.null(x$omitted)) {
cat(paste("omitted: ", paste(x\$omitted, collapse=", "), "\n", sep=""))
}
}


## Try the rpf package in your browser

Any scripts or data that you put into this service are public.

rpf documentation built on Aug. 12, 2021, 1:06 a.m.