probit: Item response analysis with random effects.

Documented in anova.probit confint.probit print.probit summary.probit

#' @title The \code{probit} class and some basic methods
#'
#' @name probit-class
#' @description
#' Objects of class \code{probit} as generated by \code{\link{probit}} is a list with entries as specified below.
#' \describe{
#'   \item{\code{fixed}}{Model for the fixed effects.}
#'   \item{\code{response.name}}{Name of response in internal representation.}
#'   \item{\code{weight.name}}{Name of weight in internal representation.}
#'   \item{\code{item.name}}{Name of item identifier in the model formula.}
#'   \item{\code{items.interval}}{Character vector with names of interval responses.}
#'   \item{\code{items.ordinal}}{Character vector with names of ordinal responses.}
#'   \item{\code{ordinal.levels}}{List of character vector with levels names of ordinal variables.}
#'   \item{\code{subject}}{Name of subject identifier.}
#'   \item{\code{random}}{List of mean models for the random effects.}
#'   \item{\code{dependence}}{Text string (\code{marginal} or \code{joint}) specifying the used correlation structure.}
#'   \item{\code{m.fixed}}{\code{\link[biglm]{biglm}}-object with joint fit of fixed effects. Note that test and confidence intervals on this object does not make direct sense.}
#'   \item{\code{sigma2}}{List with estimated variances for the interval responses.}
#'   \item{\code{eta}}{List with estimated threshold parameters for the ordinal responses.}
#'   \item{\code{m.random}}{List of \code{\link[stats]{lm}}-objects for mean models on the random effects.}
#'   \item{\code{Gamma}}{Cholesky factor of estimated inverse variance of random effects.}
#'   \item{\code{mu}}{Matrix (subjects, q) of estimated approximate conditional means for the random effects, where q = number of random effects.}
#'   \item{\code{psi}}{Matrix (subjects, q*(q+1)/2) of estimated Cholesky factors of approximate inverse conditional variances for the random effects.}
#'   \item{\code{B}}{Number of replications for each subject in Monte-Carlo computation in minimization step.}
#'   \item{\code{BB}}{Number of replications for each subject in Monte-Carlo computation in maximization step.}
#'   \item{\code{logL}}{Guess at \code{log(likelihood)}. Should be compared to \code{sum(F1)}.}
#'   \item{\code{F1}}{Vector with subject-wise \code{F1}-values from last minimization step.}
#'   \item{\code{pvalue}}{p-value from latest T-test comparing the stochastic optimization.}
#'   \item{\code{iter}}{Number of minimization-maximization steps.}
#'   \item{\code{code}}{Iteration status: 0=iterations halted due to p-value, 1=iterations reached maximum iterations.}
#'   \item{\code{data}}{Tibble with dataset analyzed.}
#' }
#'
#' @param x Object of class \code{probit}.
#' @param object Object of class \code{probit}.
#' @param level Approximative coverage for confidence interval. Defaults to \code{level=0.95}.
#' @param BB Number of simulations per subject in \code{anova} or \code{update}. Defaults to \code{BB=NULL}, which corresponds to \code{BB} taken from the call object.
#' @param digits Number of digits in print of \code{probit_anova} object. Defaults to \code{digits=4}.
#' @param data Date frame with data on the wide format. Defaults to \code{data=NULL}, which corresponds to \code{data} taken from the call object.
#' @param B Number of simulations in minimization step. Defaults to \code{B=NULL}, which corresponds to \code{B} taken from the call object.
#' @param maxit Maximal number of minimization-maximization steps. Defaults to \code{maxit=20}.
#' @param sig.level Significance level at which the iterative stochastic optimizations will be stopped. Defaults to \code{sig.level=0.60}.
#' @param verbose Numeric controlling amount of convergence diagnostics. Default: \code{verbose=0} corresponding to no output.
#'
#' @note \code{anova} re-simulates the underlying responses and random
#' effects for the fixed effects model. Hence the output of the top part
#' of anova table is stochastic.
#'
#' @seealso
#' \code{\link{probit}}
#'
#' @rdname probit-class
#' @export
print.probit <- function(x) {
  cat("Probit regression with",length(x$items.interval),"interval items and",length(x$items.ordinal),"ordinal items\n")
  cat("Total approximative log(likelihood)=",-sum(x$F1),"\n")
}

#' @rdname probit-class
#' @export
summary.probit <- function(object) {
  cat("Probit regression with",length(object$items.interval),"interval items,",length(object$items.ordinal),"ordinal items, and",length(object$random),"random effects.\n")
  cat("\n")
  cat("Total approximative log(likelihood)=",-sum(object$F1),"via",paste0("(",object$B,",",object$BB,")"),"replications in minimization-maximization steps.\n")
  cat("\n")
  if (object$dependence=="marginal") {
    cat("Variance of random effects (fitted as jointly independent):\n")
  } else {
    cat("Variance of random effects (fitted as jointly dependent):\n")
  }
  print(solve(t(object$Gamma)%*%(object$Gamma)))
  cat("\n")
  if (length(object$items.interval)>0) {
    cat("Variances on interval items:\n")
    print(unlist(object$sigma2))
    cat("\n")
  }
  if (length(object$items.ordinal)>0) {
    cat("Threshold parameters for ordinal items:\n")
    print(object$eta)
    cat("\n")
  }
}


#' @rdname probit-class
#' @export
confint.probit <- function(object,level=0.95) {
  bhat <- biglm:::coef.biglm(object$m.fixed)
  SE   <- sqrt(diag(biglm:::vcov.biglm(object$m.fixed)) * object$BB)
  mydf <- (object$m.fixed$df.resid + sum(!is.na(bhat)))/object$BB - sum(!is.na(bhat))

  # Coefficient table for fixed effects
  my.confint <- data.frame(variable=object$item.name,
                           estimate=bhat,
                           CI.lower=bhat+qt((1-level)/2,df=mydf)*SE,
                           CI.upper=bhat+qt(1-(1-level)/2,df=mydf)*SE,
                           SE=SE,
                           t.statistic=bhat/SE,
                           p.value=2*(1-pt(abs(bhat/SE),df=mydf)))

  # Coefficient table for random effects
  random.eff <- unlist(lapply(object$random,function(x){all.vars(x[[2]])}))
  for (i in random.eff) {
    if (length(coef(object$m.random[[i]]))>0) {
      tmp <- as.data.frame(cbind(summary(object$m.random[[i]])$coefficients,confint(object$m.random[[i]])))[,c(1,5,6,2,3,4)]
      tmp <- cbind(variable=i,tmp)
      colnames(tmp) <- colnames(my.confint)
      rownames(tmp) <- paste0(rownames(tmp),".",i)
      my.confint <- rbind(my.confint,tmp)
    }
  }

  # return result
  return(structure(my.confint,class=c("probit_confint","data.frame")))
}


#' @rdname probit-class
#' @export
anova.probit <- function(object,BB=NULL) {
  # fit lm-object in order to be able to extract ANOVA table

  # take parameters from object
  subjects   <- unique(object$data[[object$subject.name]])
  q          <- length(object$random)
  items      <- c(object$items.interval,object$items.ordinal)
  random.eff <- unlist(lapply(object$random,function(x){all.vars(x[[2]])}))
  if (is.null(BB)) BB <- object$BB

  # design matrix for Cholesky factor of inverse covariance in normal approximation
  Q <- matrix(0,q*q,q*(q+1)/2)
  Q[matrix(1:(q*q),q,q)[upper.tri(matrix(0,q,q),diag=TRUE)],] <- diag(nrow=q*(q+1)/2)

  # set-up data matrix with random input
  U <- as_tibble(cbind(rep(subjects,each=BB),matrix(0,length(subjects)*BB,q)),
                 .name_repair = "minimal")
  names(U) <- c(object$subject.name,random.eff)
  for (s in 1:length(subjects)) {
    U[U[[object$subject.name]]==subjects[s],-1] <- t(object$mu[s,] + solve(matrix(Q%*%object$psi[s,],q,q),matrix(rnorm(BB*q),q,BB)))
  }
  mydata <- full_join(as_tibble(object$data),U,by=object$subject.name)

  # simulate responses for ordinal variables
  for (i in object$items.ordinal) {
    ii  <- !is.na(mydata[[i]]); NN <- sum(ii)
    tmp <- tibble(factor(rep(i,NN),levels=items)); names(tmp) <- object$item.name
    my.offset <- predict_slim(object$m.fixed,bind_cols(mydata[ii,],tmp))
    # simulated underlying normal and insert in mydata
    tmp <- as.numeric(mydata[[i]][ii])
    mydata[,i] <- rep(as.numeric(NA),nrow(mydata))
    mydata[ii,i] <- my.offset + qnorm(
      pnorm(c(-Inf,object$eta[[i]])[tmp]-my.offset) +
        (pnorm(c(object$eta[[i]],Inf)[tmp]-my.offset) - pnorm(c(-Inf,object$eta[[i]])[tmp]-my.offset))*runif(NN)
    )
  }

  # linear regression
  mydata <- pivot_longer(mydata,all_of(items),names_to = object$item.name, values_to = object$response.name)
  m.lm <- lm(eval(substitute(update(formula(object$fixed),y~.),list(y=as.name(object$response.name)))),
             data=mydata)

  # extract and correct anova from fixed effect model
  my.anova <- as.data.frame(anova(m.lm))
  my.anova["Residuals","Df"] <- (m.lm$df.residual + sum(!is.na(m.lm$coefficients)))/BB - sum(!is.na(m.lm$coefficients))
  my.anova[,"Sum Sq"] <- my.anova[,"Sum Sq"]/BB
  my.anova[,"Mean Sq"] <- my.anova[,"Sum Sq"]/my.anova[,"Df"]
  my.anova[-nrow(my.anova),"F value"] <- my.anova[-nrow(my.anova),"Mean Sq"] / my.anova[nrow(my.anova),"Mean Sq"]
  my.anova[-nrow(my.anova),"Pr(>F)"] <- 1-pf(my.anova[-nrow(my.anova),"F value"],df1=my.anova[-nrow(my.anova),"Df"],df2=my.anova[nrow(my.anova),"Df"])
  my.anova <- cbind(variable=object$item.name,my.anova)
  # extract anova's for random effect models
  for (i in random.eff) {
    tmp <- cbind(variable=i,as.data.frame(anova(object$m.random[[i]])))
    rownames(tmp)[nrow(tmp)] <- paste0("Residuals.",i)
    my.anova <- rbind(my.anova,tmp)
  }

  # return
  return(structure(my.anova,class=c("probit_anova","data.frame")))
}

#' @rdname probit-class
#' @export
print.probit_confint <- function(x,digits=4) {
  if (!is.null(digits)) x[,-1] <- round(x[,-1],digits=digits)
  tmp <- rownames(x)
  x <- as.data.frame(apply(x,2,function(y){ifelse(is.na(y),".",as.character(y))})); rownames(x) <- tmp
  skip <- data.frame(V1=paste(rep("-",max(8,nchar(x$variable))),collapse=""),
                     V2=paste(rep("-",max(8,nchar(x$estimate))),collapse=""),
                     V3=paste(rep("-",max(8,nchar(x$CI.lower))),collapse=""),
                     V4=paste(rep("-",max(8,nchar(x$CI.upper))),collapse=""),
                     V5=paste(rep("-",max(2,nchar(x$SE))),collapse=""),
                     V6=paste(rep("-",max(11,nchar(x$t.statistic))),collapse=""),
                     V6=paste(rep("-",max(7,nchar(x$p.value))),collapse=""))
  rownames(skip) <- ""; colnames(skip) <- colnames(x)
  tmp <- which(c("",x$variable)!=c(x$variable,""))
  y <- skip; for (i in 2:length(tmp)) {
    rownames(skip) <- paste(c(" ",rownames(skip)),collapse="")
    y <- rbind(y,x[tmp[i-1]:(tmp[i]-1),],skip)
  }
  print(y)
}

#' @rdname probit-class
#' @export
print.probit_anova <- function(x,digits=4) {
  if (!is.null(digits)) x[,-1] <- round(x[,-1],digits=digits)
  tmp <- rownames(x)
  x <- as.data.frame(apply(x,2,function(y){ifelse(is.na(y),".",as.character(y))})); rownames(x) <- tmp
  skip <- data.frame(V1=paste(rep("-",max(8,nchar(x$variable))),collapse=""),
                     V2=paste(rep("-",max(2,nchar(x$Df))),collapse=""),
                     V3=paste(rep("-",max(6,nchar(x$'Sum Sq'))),collapse=""),
                     V4=paste(rep("-",max(7,nchar(x$'Mean Sq'))),collapse=""),
                     V5=paste(rep("-",max(7,nchar(x$'F value'))),collapse=""),
                     V6=paste(rep("-",max(6,nchar(x$'Pr(>F)'))),collapse=""))
  rownames(skip) <- ""; colnames(skip) <- colnames(x)
  tmp <- which(c("",x$variable)!=c(x$variable,""))
  y <- skip; for (i in 2:length(tmp)) {
    rownames(skip) <- paste(c(" ",rownames(skip)),collapse="")
    y <- rbind(y,x[tmp[i-1]:(tmp[i]-1),],skip)
  }
  print(y)
}


# #' @rdname probit-class
# #' @export
# update.probit <- function(object,fixed=NULL,random=NULL,dependence=NULL,data=NULL,B=NULL,BB=NULL,maxit=20,sig.level=0.6,verbose=0) {
#   # update fixed effects?
#   if (is.null(fixed)) {fixed <- formula(object$fixed)} else {
#     fixed <- update(formula(object$fixed),fixed)
#   }
#
#   # update random effects?
#   if (is.null(random)) {random <- object$random} else {
#     new_random   <- object$random
#     ranef_update <- sapply(random,function(x){all.vars(x[[2]])})
#     random_ii    <- match(ranef_update,sapply(new_random,function(x){all.vars(x[[2]])}))
#     if (any(is.na(random_ii))) stop("Random effects to be updated must already be present")
#     for (i in 1:length(ranef_update)) {
#       new_random[[random_ii[i]]] <- update(new_random[[random_ii[i]]],random[[i]])
#     }
#     random <- new_random
#   }
#
#   # take present values of dependence, B, BB, data?
#   if (is.null(dependence)) {dependence <- object$dependence}
#   if (is.null(B))          {B <- object$B}
#   if (is.null(BB))         {BB <- object$BB}
#   if (is.null(data))       {data <- object$data}
#
#   # fix dependence
#   if (dependence!="marginal") dependence <- "joint"
#
#   # refit and return
#   return(MM_probit(maxit,sig.level,verbose,
#                    fixed,object$response.name,object$item.name,object$items.interval,object$items.ordinal,
#                    object$subject.name,random,dependence,
#                    object$m.fixed,object$eta,
#                    object$mu,object$psi,
#                    B,BB,
#                    data))
# }

#' @rdname probit-class
#' @export
predict.probit <- function(object,re.form=TRUE) {
  # find subjects and items
  subjects <- unique(object$data[[object$subject.name]])
  items    <- c(object$items.interval,object$items.ordinal)

  # find random effects
  random.eff <- sapply(object$random,function(x){all.vars(x[[2]])})

  # if re.form=TRUE, then include predicted random effects
  if (re.form) {
    # take prediction random effects
    mu <- object$mu
  } else {
    # use m.random to predict random effects
    mu <- matrix(0,length(subjects),ncol(object$mu))
    data.short <- object$data %>% group_by(!!as.name(object$subject.name)) %>% slice_head(n=1)
    for (i in 1:ncol(object$mu)) {
      mu[,i] <- predict(object$m.random[[i]],newdata=data.short)
    }
  }

  # set-up data matrix with predicted random input
  U <- as_tibble(cbind(subjects,mu),.name_repair = "minimal")
  names(U) <- c(object$subject.name,random.eff)
  mydata   <- full_join(object$data,U,by=object$subject.name)
  for (i in object$items.ordinal) mydata[,i] <- as.numeric(as.matrix(mydata[,i]))
  mydata[,items] <- matrix(
    predict_slim(object$m.fixed,
                 pivot_longer(mydata, all_of(items),
                              names_to = object$item.name, values_to = object$response.name)),
    nrow(mydata),length(items), byrow = TRUE)

  # return prediction
  return(mydata)
}


# Hack: Define generic functions by stealing from emmeans-package.
#       Needed to make things work together with future::plan().
setGeneric("recover_data",emmeans:::recover_data)
setGeneric("emm_basis",emmeans:::emm_basis)

#' @rdname probit-class
#' @export
recover_data.probit <- function(object, ...) {
  # grab parameters
  subjects <- unique(object$data[[object$subject.name]])
  random.eff <- sapply(object$random,function(x){all.vars(x[[2]])})
  q          <- length(random.eff)

  # grab predicted random effects
  U        <- as_tibble(cbind(subjects,object$mu),.name_repair = "minimal")
  names(U) <- c(object$subject.name,random.eff)
  mydata   <- full_join(object$data,U,by=object$subject.name)

  # recode ordinal items as numeric
  for (i in object$items.ordinal) mydata[[i]] <- as.numeric(mydata[[i]])

  # make into long format
  mydata <- pivot_longer(mydata,all_of(c(object$items.interval,object$items.ordinal)),
                         names_to = object$item.name,values_to = object$response.name)

  # find data without NA's
  tbl <- get_all_vars(delete.response(terms(object$m.fixed)),
                      mydata[!is.na(mydata[[object$response.name]]),])
  attr(tbl, "terms")      <- delete.response(terms(object$m.fixed))
  attr(tbl, "predictors") <- .all.vars(delete.response(terms(object$m.fixed)))
  #attr(tbl, "responses")  <- object$response.name
  tbl
}

#' @rdname probit-class
#' @export
emm_basis.probit <- function(object, trms, xlev, grid, ...) {
  m <- model.frame(trms, grid, na.action = na.pass, xlev = xlev)
  X <- model.matrix(trms, m)
  bhat <- biglm:::coef.biglm(object$m.fixed)
  V <- biglm:::vcov.biglm(object$m.fixed) * object$BB
  # PRESUMABLY IS ESTIMABILITY TO BE UPDATED!!!
  nbasis <- matrix(NA)
  # SO HAVE A LOOK AT THAT!
  dfargs <- list(df = (object$m.fixed$df.resid + sum(!is.na(bhat)))/object$BB - sum(!is.na(bhat)))
  dffun  <- function(k, dfargs) dfargs$df
  # Return result
  list(X = X, bhat = bhat, nbasis = nbasis, V = V, dffun = dffun, dfargs = dfargs)
}