#' Goodness of Fit for GPT Models
#'
#' The continuous variable of a GPT model is categorized into discrete bins to compute Pearson's
#' X^2 (and related statistics) between the predicted and observed bin frequencies.
#'
#' @param gpt_fit a fitted GPT model (see \code{\link{gpt_fit}})
#' @param breaks either a list giving the breakpoints for each category, or a vector,
#' in which case the same breakpoints are used for all categories. By default,
#' model-implied quantiles are used for each category.
#' @param bins number of bins used to compute model-implied boundaries/quantiles.
# or other arguments passed to \code{\link[graphics]{hist}}.
# Can be a list of arguments if multivariate continuous data are modelled.
#' @param statistic a character vector with labels of the statistics to be computed.
#' \itemize{
#' \item \code{"dn"} the Dzhaparidze-Nikulin statistic
#' \item \code{"pf"} the Pearson-Fisher test (refitting the model to the binned data)
#' \item \code{"pd"} the power-divergence statistic with parameter \code{lambda}.
#' Since the asymptotic distribution is not chi^2, this statistic requires a
#' parametric or nonparametric bootstrap (not implemented).
#' }
#' @param lambda Only relevant for \code{statistic = "pf"} and \code{"pd"}.
#' Lambda is the parameter of the power-divergence statistic by Read & Cressie (1988).
#' \code{lambda=1} is equivalent to Pearson's X^2, and \code{lambda=0} is
#' equivalent to the likelihood-ratio statistic G^2.
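#' The power-divergence family (as implemented in the internal helper \code{pd()} below) is
#' \deqn{PD(\lambda) = \frac{2}{\lambda (\lambda + 1)} \sum_i o_i \left[ \left( \frac{o_i}{e_i} \right)^{\lambda} - 1 \right],}
#' where \eqn{o_i} and \eqn{e_i} denote the observed and expected bin frequencies, respectively.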
# @details
# For the number of cells, D'Agostino & Stephens (1986; p.70) recommend 2*n^(2/5), where
# n is the number of observations for a univariate continuous random variable.
#'
#' @template dzhaparidze1974
#' @template read1988
#' @template dagostino1986
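#' @examples
#' \dontrun{
#' ## Minimal sketch, assuming `fit` is a model fitted with gpt_fit()
#' ## (arguments of gpt_fit() omitted here):
#' # fit <- gpt_fit(...)
#' gof <- test_fit(fit, bins = 6, statistic = c("dn", "pf"))
#' gof$test       # data frame with statistic, df, and p-value per test
#' gof$observed   # observed frequencies per category and bin
#' gof$expected   # model-implied expected frequencies
#' gof$breaks     # breakpoints used for binning
#' }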
#' @export
test_fit <- function (gpt_fit, breaks, bins = 6, statistic = "dn", lambda = 1){

  statistic <- sapply(statistic, match.arg, choices = c("dn", "pf", "pd"))
  names(statistic) <- statistic

  if (ncol(gpt_fit$data$y) > 1)
    stop("Goodness of fit not implemented for higher dimensions of y!")
  # hists <- mapply(hist, x = gpt_fit$data$y, breaks = breaks, MoreArgs = list(plot = FALSE))

  J <- length(gpt_fit$gpt@mpt@cat.names)   # number of categories
  S1 <- length(gpt_fit$gpt@theta)          # number of theta parameters
  S2 <- length(gpt_fit$gpt@eta)            # number of eta parameters
  par <- gpt_fit$fit.grad$par
  y <- gpt_fit$data$y

  if (missing(breaks) || is.null(breaks)){
    # model-implied quantiles (conditional on category)
    pp <- seq(0, 1, length.out = bins + 1)[- c(1, bins + 1)]
    catprob <- mpt.cat.prob(gpt_fit$gpt@mpt, theta = par[1:S1])
    conditional_cdf <- function(y, j)
      cdf(gpt_fit$gpt,
          x = rep(j, length(y)),
          y = matrix(y),
          theta = par[1:S1],
          eta = par[S1 + (1:S2)],
          # const = gpt_fit$gpt@const,  ## S4 class cdf("gpt") not defined with const="numeric" !
          log.p = FALSE) / catprob[j]
    diff_p <- function(q, j, cumprob) conditional_cdf(q, j) - cumprob

    b.list <- vector("list", J)
    names(b.list) <- gpt_fit$gpt@mpt@cat.names
    for (j in 1:J){
      for (b in 1:(bins - 1)){
        b.list[[j]][b] <- uniroot(diff_p,
                                  lower = 2 * min(y) - max(y),
                                  upper = 2 * max(y) - min(y),
                                  j = j, cumprob = pp[b])$root
      }
    }

  } else if (!is.list(breaks) && is.vector(breaks)){
    bins <- length(breaks) + 1
    b.list <- rep(list(breaks), J)

  } else if (is.list(breaks)) {
    if (length(breaks) != J)
      stop("Length of 'breaks' must be identical to the number of categories.")
    b.list <- breaks
    bins <- length(breaks[[1]]) + 1

  } else {
    stop("'breaks' must be a list or a vector.")
  }

  # assign each continuous observation to a bin (per category)
  y.cat <- y
  for (j in 1:J){
    sel <- gpt_fit$data$x == j
    y.cat[sel] <- findInterval(y[sel], b.list[[j]]) + 1
  }

  # expected and observed frequencies per category and bin
  N <- table(gpt_fit$gpt@mpt@tree.idx[gpt_fit$data$x])   # number of samples per tree
  prob <- prob.bins(gpt_fit$fit.grad$par, gpt_fit$gpt, b.list)
  e <- c(N[gpt_fit$gpt@mpt@tree.idx]) * prob
  o <- table(x = factor(gpt_fit$data$x, seq(gpt_fit$gpt@mpt@cat.names)),
             y = factor(y.cat, seq(bins)))

  test <- lapply(paste0("test.", statistic), function(ss)
    do.call(ss, args = list(o = o, e = e, gpt_fit = gpt_fit,
                            b.list = b.list, lambda = lambda)))
  list("test" = data.frame(do.call("rbind", test)),
       "observed" = o, "expected" = e, "breaks" = b.list)
}
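## Standalone sketch of the uniroot-based quantile search used above, with the standard
## normal CDF in place of the model-implied conditional CDF (hypothetical example):
# cumprob <- 0.25
# uniroot(function(q) pnorm(q) - cumprob, lower = -10, upper = 10)$root   # approx. -0.674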
prob.bins <- function(par, gpt, b.list){
  J  <- length(gpt@mpt@cat.names)   # number of categories
  S1 <- length(gpt@theta)           # number of theta parameters
  S2 <- length(gpt@eta)             # number of eta parameters
  B  <- length(b.list[[1]]) + 1     # number of bins

  prob <- matrix(NA, J, B, dimnames = list(gpt@mpt@cat.names, paste0("bin", 1:B)))
  for (j in 1:J){
    prob[j, 1:(B - 1)] <- cdf(gpt,
                              x = rep(j, B - 1),
                              y = matrix(b.list[[j]]),
                              theta = par[1:S1],
                              eta = par[S1 + (1:S2)],
                              log.p = FALSE)
  }
  prob[, B] <- mpt.cat.prob(gpt@mpt, par[1:S1])
  prob[, 2:B] <- prob[, 2:B] - prob[, 1:(B - 1)]
  prob
}
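## Standalone sketch of the differencing logic in prob.bins() (numbers are hypothetical):
## for one category with two interior breakpoints, the joint CDF values at the breakpoints
## and the total category probability yield the three bin probabilities:
# cdf_vals <- c(0.10, 0.35)        # cdf(gpt, ...) evaluated at the two breakpoints
# cat_prob <- 0.50                 # mpt.cat.prob(...) for this category
# diff(c(0, cdf_vals, cat_prob))   # bin probabilities: 0.10, 0.25, 0.15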
########## higher dimensions (not working)
# b.mat <- as.matrix(expand.grid(b.list))
# prob.list <- list()
# for(d in 1:length(b.list)){ # dimension of y
# prob.list[[d]] <- matrix(NA, J, B, dimnames = list(gpt@mpt@cat.names, NULL))
# for (cc in 1:J){
# prob.list[[d]][cc,] <- cdf(gpt, x = rep(cc, B[d]), y = matrix(b.list.interior[[d]], ncol = 1),
# theta = par[1:S1], eta = par[S1 + (1:S2)], log.p = FALSE)
# }
# }
# frequency table for binned continuous (y) & discrete (x) data
count.bins <- function(x, y, J, b.list){
  B <- sapply(b.list, length) - 1   # number of bins per dependent variable
  y.cat <- data.frame(mapply(findInterval, x = data.frame(y), vec = b.list))
  for (d in 1:ncol(y.cat))
    y.cat[, d] <- factor(y.cat[, d], levels = 1:B[[d]])
  tab <- table(data.frame(x = factor(x, levels = 1:J), y.cat))
  matrix(tab, J)
}
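## Toy illustration of count.bins() (hypothetical data; assumes b.list contains the full
## break vectors including the outer bounds):
# x <- c(1, 1, 2, 2, 2)
# y <- c(0.2, 0.9, 0.4, 1.5, 2.1)
# count.bins(x, y, J = 2, b.list = list(c(-Inf, 1, Inf)))
# ## 2 x 2 matrix of counts (rows = categories, columns = bins)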
# Power-divergence statistic
#
# o: observed frequencies (book: q = probability)
# e: expected frequencies (book: p = probability)
pd <- function(o, e, lambda = 1){
  pd <- switch(as.character(lambda),
               "1" = sum((o - e)^2 / e),       # X^2
               "0" = {
                 log.oe <- ifelse(o == 0, 0, log(o / e))
                 2 * sum(o * log.oe)           # G^2
               },
               2 * sum(o * ((o / e)^lambda - 1)) / lambda / (lambda + 1))
  pd
}
# check continuity of X^2 (lambda=1) and G^2 (lambda=0):
# p <- c(.1,.2,.3,.4)
# e <- p * 50
# o <- rmultinom(1, 50, p)
# curve(sapply(x, function(l) pd(o, e,l)), .9,1.1, n=10001)
# wrapper without p-value (the asymptotic distribution is not chi^2; a bootstrap would be required)
test.pd <- function(o, e, gpt_fit = NULL, b.list = NULL, lambda = 1){
  pd <- pd(o, e, lambda = lambda)
  # df <- length(o) - length(gpt_fit$gpt@mpt@tree.names) - length(gpt_fit$fit.grad$par)
  # pval <- pchisq(pd, df, lower.tail = FALSE)
  c("statistic" = pd, "df" = NA, "p.value" = NA)
}
# multinomial log-likelihood of the binned frequencies:
ll.bins <- function(par, gpt, freq, b.list){
  pp <- prob.bins(par, gpt, b.list)
  nlogp <- freq * log(pp)
  nlogp[is.nan(nlogp) | nlogp == -Inf] <- -1e20   # guard against log(0)
  sum(nlogp)
}
# matrix B for the Rao-Robson and Dzhaparidze-Nikulin statistics
get.B <- function (par, gpt, b.list){
  dp.dtheta <- jacobian(prob.bins, par, gpt = gpt, b.list = b.list)
  p <- c(prob.bins(par, gpt = gpt, b.list = b.list))
  1 / sqrt(p) * dp.dtheta
}
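## Note on get.B(): row i of the returned matrix corresponds to one (category x bin) cell,
## with entries B[i, s] = d p_i / d par_s / sqrt(p_i), i.e., the Jacobian of the vectorized
## bin probabilities scaled by 1/sqrt(p_i). This matrix enters the quadratic forms of
## test.rr() and test.dn() below.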
################ Rao and Robson (1974): chi-square with df = M-1
# requires the expected Fisher information for a sample of size 1!
#' @importFrom numDeriv jacobian
test.rr <- function(o, e, gpt_fit, b.list, lambda = NULL){
  B <- get.B(gpt_fit$fit.grad$par, gpt_fit$gpt, b.list)
  # information matrix of the raw data (estimate of the expected Fisher information):
  I <- - gpt_fit$fit.grad$hess / nrow(gpt_fit$data$y)

  # checks whether precision decreases with binned data:
  # N <- table(gpt_fit$gpt@mpt@tree.idx[gpt_fit$data$x])  # number of samples per tree
  # J <- - hessian(ll.bins, gpt_fit$fit.grad$par, gpt = gpt_fit$gpt, freq = o, b.list = b.list)  # Fisher info for binned data
  # rbind(raw = gpt_fit$fit.grad$SE,
  #       invI = 1/sqrt(diag(I * nrow(gpt_fit$data$y))),
  #       binned = sqrt(diag(solve(J))),
  #       invJ = 1/sqrt(diag(J)),
  #       invBtB = 1/sqrt(diag(t(B) %*% B * length(o))))
  # J / length(o) - t(B) %*% B

  V <- c((o - e) / sqrt(e))
  # D'Agostino & Stephens:
  Q <- diag(length(o)) + B %*% solve(I - t(B) %*% B) %*% t(B)
  # McCulloch 1985:
  # Q <- solve(diag(length(o)) + B %*% solve(- I) %*% t(B))
  # Balakrishnan, Voinov & Nikulin (2013) & D'Agostino & Stephens (Version 2):
  # rr <- pd(o, e, 1) - t(V) %*% B %*% solve(I - t(B) %*% B) %*% t(B) %*% V
  # Nikulin & Chimitova (2017) p. 8 (1.17) & p. 13 (1.24):
  # q <- sqrt(c(prob.bins(gpt_fit$fit.grad$par, gpt = gpt_fit$gpt, b.list = b.list)))
  # Q <- diag(length(o)) - q %*% t(q) - B %*% solve(I) %*% t(B)  # called "G" in the paper

  rr <- c(t(V) %*% Q %*% V)
  df <- length(o) - length(gpt_fit$gpt@mpt@tree.names)
  c("statistic" = rr, "df" = df, "p.value" = pchisq(rr, df, lower.tail = FALSE))
}
################## Dzhaparidze and Nikulin (1974): df = M-p-1
test.dn <- function (o, e, gpt_fit, b.list, lambda = NULL){
  dn <- NA
  try({
    B <- get.B(gpt_fit$fit.grad$par, gpt_fit$gpt, b.list)
    V <- c((o - e) / sqrt(e))
    # projection onto the orthogonal complement of the columns of B:
    Q <- diag(length(o)) - B %*% solve(t(B) %*% B) %*% t(B)
    dn <- c(t(V) %*% Q %*% V)
  })
  df <- length(o) - length(gpt_fit$fit.grad$par) - length(gpt_fit$gpt@mpt@tree.names)
  c("statistic" = dn, "df" = df, "p.value" = pchisq(dn, df, lower.tail = FALSE))
}
################## Pearson-Fisher (re-fitting with discrete bins)
test.pf <- function(o, e, gpt_fit, b.list, lambda = 1){
  N <- table(gpt_fit$gpt@mpt@tree.idx[gpt_fit$data$x])   # number of samples per tree

  # minimum power-divergence (e.g., minimum X^2) estimation based on the binned data
  pd.tmp <- function(par){
    pp <- prob.bins(par, gpt = gpt_fit$gpt, b.list = b.list)
    ee <- c(N[gpt_fit$gpt@mpt@tree.idx]) * pp
    pd <- pd(o = o, e = ee, lambda = lambda)
    if (is.na(pd) || pd == Inf) pd <- 1e20
    pd
  }

  S1 <- length(gpt_fit$gpt@theta)
  oo <- optim(gpt_fit$fit.grad$par, pd.tmp,
              lower = c(rep(1e-8, S1), get.eta.lower(gpt_fit$gpt)),
              upper = c(rep(1 - 1e-8, S1), get.eta.upper(gpt_fit$gpt)),
              method = "L-BFGS-B", control = list(maxit = 5000))
  if (oo$convergence != 0)
    warning("optim did not converge when refitting with binned data!")

  df <- length(o) - length(gpt_fit$fit.grad$par) - length(gpt_fit$gpt@mpt@tree.names)
  pval <- pchisq(oo$value, df, lower.tail = FALSE)
  c("statistic" = oo$value, "df" = df, "p.value" = pval)
}