R/iidJack.R
In lavaSearch2: Tools for Model Specification in the Latent Variable Framework

Documented in iidJack iidJack.default

### iidJack.R --- 
#----------------------------------------------------------------------
## author: Brice Ozenne
## created: jun 23 2017 (09:15) 
## Version: 
## last-updated: Jan 11 2022 (16:08) 
##           By: Brice Ozenne
##     Update #: 333
#----------------------------------------------------------------------
## 
### Commentary: 
## 
### Change Log:
#----------------------------------------------------------------------
## 
### Code:

## * documentation - iidJack
#' @title Jackknife iid Decomposition from Model Object
#' @description Extract iid decomposition (i.e. influence function) from model object.
#'
#' @name iidJack
#' 
#' @param object a object containing the model.
#' @param data [data.frame] dataset used to perform the jackknife.
#' @param grouping [vector] variable defining cluster of observations that will be simultaneously removed by the jackknife.
#' @param cpus [integer >0] the number of processors to use.
#' If greater than 1, the fit of the model and the computation of the influence function for each jackknife sample is performed in parallel. 
#' @param keep.warnings [logical] keep warning messages obtained when estimating the model with the jackknife samples.
#' @param keep.error [logical]keep error messages obtained when estimating the model with the jackknife samples.
#' @param cl [cluster] a parallel socket cluster generated by \code{parallel::makeCluster}
#' that has been registered using \code{registerDoParallel}.
#' @param trace [logical] should a progress bar be used to trace the execution of the function
#' @param ... [internal] only used by the generic method.
#'
#' @return A matrix with in row the samples and in columns the parameters.
#' 
#' @examples
#' n <- 20
#' set.seed(10)
#' mSim <- lvm(c(Y1,Y2,Y3,Y4,Y5) ~ 1*eta)
#' latent(mSim) <- ~eta
#' categorical(mSim, K=2) <- ~G
#' transform(mSim, Id ~ eta) <- function(x){1:NROW(x)}
#' dW <- lava::sim(mSim, n, latent = FALSE)
#'
#' #### LVM ####
#' \dontrun{
#' m1 <- lvm(c(Y1,Y2,Y3,Y4,Y5) ~ 1*eta)
#' latent(m1) <- ~eta
#' regression(m1) <- eta ~ G
#' e <- estimate(m1, data = dW)
#'
#' iid1 <- iidJack(e)
#' iid2 <- iid(e)
#' attr(iid2, "bread") <- NULL
#'
#' apply(iid1,2,sd)
#' apply(iid2,2,sd)
#' quantile(iid2 - iid1)
#' }
#'
#'
#' @concept iid decomposition
#' @export
iidJack <- function(object,...) UseMethod("iidJack")

## * method iidJack.default
#' @rdname iidJack
#' @export
iidJack.default <- function(object, data = NULL, grouping = NULL, cpus = 1,
                            keep.warnings = TRUE, keep.error = TRUE,
                            cl = NULL, trace = TRUE, ...) {

    estimate.lvm <- lava_estimate.lvm

    ## ** test args
    init.cpus <- (cpus>1 && is.null(cl))
    if(init.cpus){
        test.package <- try(requireNamespace("foreach"), silent = TRUE)
        if(inherits(test.package,"try-error")){
            stop("There is no package \'foreach\' \n",
                 "This package is necessary when argument \'cpus\' is greater than 1 \n")
        }

        if(cpus > parallel::detectCores()){
            stop("Argument \'cpus\' is greater than the number of available CPU cores \n",
                 "available CPU cores: ",parallel::detectCores(),"\n")
        }
    }
    
    ## ** extract data
    if(is.null(data)){
        myData <- extractData(object, design.matrix = FALSE, as.data.frame = TRUE,
                              envir = parent.env(environment()))
    }else{ 
        myData <-  as.data.frame(data)
    }
    
    n.obs <- NROW(myData)
    if(any(class(object) %in% "lme")){
        getCoef <- nlme::fixef
    }else{
        getCoef <- coef
    }
    coef.x <- getCoef(object)
    names.coef <- names(coef.x)
    n.coef <- length(coef.x)

    ## ** update formula/model when defined by a variable and not in the current namespace
    if(length(object$call[[2]])==1){
        modelName <- as.character(object$call[[2]])
        if(modelName %in% ls() == FALSE){
            assign(modelName, value = evalInParentEnv(object$call[[2]]))
        }
    }

    ## ** define the grouping level for the data
    if(is.null(grouping)){
        myData$XXXgroupingXXX <- 1:n.obs
        grouping <- "XXXgroupingXXX"        
    }else{
        if(length(grouping)>1){
            stop("grouping must refer to only one variable \n")
        }
        if(grouping %in% names(myData) == FALSE){
            stop("variable defined in grouping not found in data \n")
        }
    }
    myData[,grouping] <- as.character(myData[,grouping])
    Ugrouping <- unique(myData[,grouping])
    n.group <- length(Ugrouping)

    ## ** warper
    warper <- function(i){ # i <- "31"
        newData <- subset(myData, subset = myData[[grouping]]!=i)
        xnew <- tryWithWarnings(stats::update(object, data = newData))
        if(!is.null(xnew$error)){
            xnew$value <- rep(NA, n.coef)
        }else{
            xnew$value <- getCoef(xnew$value)
        }
        return(xnew)
    }
    # warper("31")

    ## ** start parallel computation
    if(init.cpus){            
        ## define cluster
        if(trace>0){
            cl <- parallel::makeCluster(cpus, outfile = "")
        }else{
            cl <- parallel::makeCluster(cpus)
        }
        ## link to foreach
        doParallel::registerDoParallel(cl)
    }

    ## *** link to foreach
    doParallel::registerDoParallel(cl)

    ## ** parallel computations: get jackknife coef
    if(cpus>1){

        if(trace>0){
            pb <- utils::txtProgressBar(max = n.group, style = 3)          
        }
        ## *** packages/objects to export
        estimator <- as.character(object$call[[1]]) 

        vec.packages <- c("lava")
        possiblePackage <- gsub("package:","",utils::getAnywhere(estimator)$where[1])
        existingPackage <- as.character(utils::installed.packages()[,"Package"])

        ls.call <- as.list(object$call)
        test.length <- which(unlist(lapply(ls.call, length))==1)
        test.class <- which(unlist(lapply(ls.call, function(cc){
            (class(c) %in% c("numeric","character","logical")) == FALSE
        })))
        test.class <- which(unlist(lapply(ls.call, class)) %in% c("numeric","character","logical") == FALSE)
    
        indexExport <- intersect(test.class,test.length)
        toExport <- sapply(ls.call[indexExport], as.character)
    
        if(possiblePackage %in% existingPackage){
            vec.packages <- c(vec.packages,possiblePackage)
        }

        parallel::clusterCall(cl, fun = function(x){
            sapply(vec.packages, function(iP){
                suppressPackageStartupMessages(requireNamespace(iP, quietly = TRUE))
            })
        })
        
        ## *** objects to export
        if(length(object$call$data)==1){
            toExport <- c(toExport,as.character(object$call$data))
        }
        if(length(object$call$formula)==1){
            toExport <- c(toExport,as.character(object$call$formula))
        }
        if(length(object$call$fixed)==1){
            toExport <- c(toExport,as.character(object$call$fixed))        
        }
        toExport <- c(unique(toExport),"tryWithWarnings")

        ## *** parallel computation
        i <- NULL # [:for CRAN check] foreach
        resLoop <- foreach::`%dopar%`(
                                foreach::foreach(i = 1:n.group,
                                                 .export = toExport),{
                                                     
                                                     if(trace>0){utils::setTxtProgressBar(pb, i)}

                                                     return(warper(Ugrouping[i]))
                                                 })
    
        if(trace>0){close(pb)}

    }else{
        
        if(trace>0){
            test.package <- try(requireNamespace("pbapply"), silent = TRUE)
            if(inherits(test.package,"try-error")){
                stop("There is no package \'pbapply\' \n",
                     "This package is necessary when argument \'trace\' is TRUE \n")
            }
            resLoop <- pbapply::pblapply(Ugrouping, warper)
            
        }else{
            resLoop <- lapply(Ugrouping, warper)
        }
        

    }
    coefJack <- do.call(rbind, lapply(resLoop,"[[","value"))
    rownames(coefJack) <- 1:n.group
    
    ## ** end parallel computation
    if(init.cpus){
        parallel::stopCluster(cl)
    }
    
    ## ** post treatment: from jackknife coef to the iid decomposition
    # defined as (n-1)*(coef-coef(-i))
    # division by n to match output of lava, i.e. IF/n
    iidJack <- -(n.group-1)/n.group * sweep(coefJack, MARGIN = 2, STATS = coef.x, FUN = "-")
    colnames(iidJack) <- names.coef

    if(keep.warnings){
        ls.warnings <- lapply(resLoop,"[[","warnings")
        names(ls.warnings) <- 1:n.group
        ls.warnings <- Filter(Negate(is.null), ls.warnings)
        attr(iidJack,"warnings") <- ls.warnings
    }
    if(keep.error){
        ls.error <- lapply(resLoop,"[[","error")
        names(ls.error) <- 1:n.group
        ls.error <- Filter(Negate(is.null), ls.error)
        attr(iidJack,"error") <- ls.error
    }
    return(iidJack)
}
    
#----------------------------------------------------------------------
### iidJack.R ends here