R/coefbootstrap.R

Defines functions print.bootstrap coefbootstrap.alm coefbootstrap.lm coefbootstrap.default coefbootstrap

Documented in coefbootstrap coefbootstrap.alm coefbootstrap.lm

#' Bootstrap for parameters of models
#'
#' The function does the bootstrap for parameters of models and returns covariance matrix
#' together with the original bootstrapped data.
#'
#' The function applies the same model as in the provided object on a smaller sample in
#' order to get the estimates of parameters and capture the uncertainty about them. This is
#' a simple implementation of the case resampling, which assumes that the observations are
#' independent.
#'
#' @param object The model estimated using either lm, or alm, or glm.
#' @param nsim Number of iterations (simulations) to run.
#' @param size A non-negative integer giving the number of items to choose (the sample size),
#' passed to \link[base]{sample} function in R. If not provided and model contains ARIMA
#' components, this value will be selected at random on each iteration.
#' @param replace Should sampling be with replacement? Also, passed to \link[base]{sample}
#' function in R.
#' @param prob A vector of probability weights for obtaining the elements of the vector
#' being sampled. This is passed to the \link[base]{sample} as well.
#' @param parallel Either a logical, specifying whether to do the calculations in parallel,
#' or the number, specifying the number of cores to use for the parallel calculation.
#'
#' @return Class "bootstrap" is returned, which contains:
#' \itemize{
#' \item vcov - the covariance matrix of parameters;
#' \item coefficients - the matrix with the bootstrapped coefficients.
#' \item nsim - number of runs done;
#' \item size - the sample size used in the bootstrap;
#' \item replace - whether the sampling was done with replacement;
#' \item prob - a vector of probability weights used in the process;
#' \item parallel - whether the calculations were done in parallel;
#' \item model - the name of the model used (the name of the function);
#' \item timeElapsed - the time that was spend on the calculations.
#' }
#'
#' @template author
#' @template keywords
#'
#' @seealso \code{\link[greybox]{alm}}
#'
#' @examples
#' # An example with ALM
#' ourModel <- alm(mpg~., mtcars, distribution="dlnorm", loss="HAM")
#' # A fast example with 10 iterations. Use at least 1000 to get better results
#' coefbootstrap(ourModel, nsim=10)
#'
#' @rdname coefbootstrap
#' @export
coefbootstrap <- function(object, nsim=1000, size=floor(0.75*nobs(object)),
                          replace=FALSE, prob=NULL, parallel=FALSE) UseMethod("coefbootstrap")

#' @export
coefbootstrap.default <- function(object, nsim=1000, size=floor(0.75*nobs(object)),
                                  replace=FALSE, prob=NULL, parallel=FALSE){

    startTime <- Sys.time();

    # Form the call for the function
    if(!is.null(object$call)){
        newCall <- object$call;
    }
    else{
        stop("In order for the function to work, the object should have the variable 'call' in it. Cannot proceed.",
             call.=FALSE);
    }

    if(is.numeric(parallel)){
        nCores <- parallel;
        parallel <- TRUE;
    }
    else if(is.logical(parallel) && parallel){
        # Detect number of cores for parallel calculations
        nCores <- min(parallel::detectCores() - 1, nsim);
    }

    # If they asked for parallel, make checks and try to do that
    if(parallel){
        if(!requireNamespace("foreach", quietly = TRUE)){
            stop("In order to run the function in parallel, 'foreach' package must be installed.", call. = FALSE);
        }
        if(!requireNamespace("parallel", quietly = TRUE)){
            stop("In order to run the function in parallel, 'parallel' package must be installed.", call. = FALSE);
        }

        # Check the system and choose the package to use
        if(Sys.info()['sysname']=="Windows"){
            if(requireNamespace("doParallel", quietly = TRUE)){
                cluster <- parallel::makeCluster(nCores);
                doParallel::registerDoParallel(cluster);
            }
            else{
                stop("Sorry, but in order to run the function in parallel, you need 'doParallel' package.",
                     call. = FALSE);
            }
        }
        else{
            if(requireNamespace("doMC", quietly = TRUE)){
                doMC::registerDoMC(nCores);
                cluster <- NULL;
            }
            else if(requireNamespace("doParallel", quietly = TRUE)){
                cluster <- parallel::makeCluster(nCores);
                doParallel::registerDoParallel(cluster);
            }
            else{
                stop("Sorry, but in order to run the function in parallel, you need either 'doMC' (prefered) or 'doParallel' packages.",
                     call. = FALSE);
            }
        }
    }

    # Coefficients of the model
    coefficientsOriginal <- coef(object);
    nVariables <- length(coefficientsOriginal);
    variablesNames <- names(coefficientsOriginal);
    interceptIsNeeded <- any(variablesNames=="(Intercept)");
    obsInsample <- nobs(object);

    # The matrix with coefficients
    coefBootstrap <- matrix(0, nsim, nVariables, dimnames=list(NULL, variablesNames));
    # Indices for the observations to use and the vector of subsets
    indices <- c(1:obsInsample);

    if(!parallel){
        for(i in 1:nsim){
            newCall$subset <- sample(indices,size=size,replace=replace,prob=prob);
            testModel <- suppressWarnings(eval(newCall));
            coefBootstrap[i,names(coef(testModel))] <- coef(testModel);
        }
    }
    else{
        # We don't do rbind for security reasons - in order to deal with skipped variables
        coefBootstrapParallel <- foreach::`%dopar%`(foreach::foreach(i=1:nsim),{
            newCall$subset <- sample(indices,size=size,replace=replace,prob=prob);
            testModel <- eval(newCall);
            return(coef(testModel));
        })
        # Prepare the matrix with parameters
        for(i in 1:nsim){
            coefBootstrap[i,names(coefBootstrapParallel[[i]])] <- coefBootstrapParallel[[i]];
        }
    }
    # Get rid of NAs. They mean "zero"
    coefBootstrap[is.na(coefBootstrap)] <- 0;

    # Centre the coefficients for the calculation of the vcov
    coefvcov <- coefBootstrap - matrix(coefficientsOriginal, nsim, nVariables, byrow=TRUE);

    return(structure(list(vcov=(t(coefvcov) %*% coefvcov)/nsim,
                          coefficients=coefBootstrap,
                          nsim=nsim, size=size, replace=replace, prob=prob, parallel=parallel,
                          model=object$call[[1]], timeElapsed=Sys.time()-startTime),
                     class="bootstrap"));
}

#' @rdname coefbootstrap
#' @export
coefbootstrap.lm <- function(object, nsim=1000, size=floor(0.75*nobs(object)),
                              replace=FALSE, prob=NULL, parallel=FALSE){
    return(coefbootstrap.default(object, nsim, size, replace, prob, parallel));
}

#' @rdname coefbootstrap
#' @export
coefbootstrap.alm <- function(object, nsim=1000, size=floor(0.75*nobs(object)),
                              replace=FALSE, prob=NULL, parallel=FALSE){

    startTime <- Sys.time();

    cl <- match.call();

    if(is.numeric(parallel)){
        nCores <- parallel;
        parallel <- TRUE;
    }
    else if(is.logical(parallel) && parallel){
        # Detect number of cores for parallel calculations
        nCores <- min(parallel::detectCores() - 1, nsim);
    }

    # If they asked for parallel, make checks and try to do that
    if(parallel){
        if(!requireNamespace("foreach", quietly = TRUE)){
            stop("In order to run the function in parallel, 'foreach' package must be installed.", call. = FALSE);
        }
        if(!requireNamespace("parallel", quietly = TRUE)){
            stop("In order to run the function in parallel, 'parallel' package must be installed.", call. = FALSE);
        }

        # Check the system and choose the package to use
        if(Sys.info()['sysname']=="Windows"){
            if(requireNamespace("doParallel", quietly = TRUE)){
                cluster <- parallel::makeCluster(nCores);
                doParallel::registerDoParallel(cluster);
            }
            else{
                stop("Sorry, but in order to run the function in parallel, you need 'doParallel' package.",
                     call. = FALSE);
            }
        }
        else{
            if(requireNamespace("doMC", quietly = TRUE)){
                doMC::registerDoMC(nCores);
                cluster <- NULL;
            }
            else if(requireNamespace("doParallel", quietly = TRUE)){
                cluster <- parallel::makeCluster(nCores);
                doParallel::registerDoParallel(cluster);
            }
            else{
                stop("Sorry, but in order to run the function in parallel, you need either 'doMC' (prefered) or 'doParallel' packages.",
                     call. = FALSE);
            }
        }
    }

    # Coefficients of the model
    coefficientsOriginal <- coef(object);
    nVariables <- length(coefficientsOriginal);
    variablesNames <- names(coefficientsOriginal);
    interceptIsNeeded <- any(variablesNames=="(Intercept)");
    variablesNamesMade <- make.names(variablesNames);
    if(interceptIsNeeded){
        variablesNamesMade[1] <- variablesNames[1];
    }
    obsInsample <- nobs(object);

    # The matrix with coefficients
    coefBootstrap <- matrix(0, nsim, nVariables, dimnames=list(NULL, variablesNames));
    # Indices for the observations to use and the vector of subsets
    indices <- c(1:obsInsample);

    # Form the call for alm
    newCall <- object$call;
    # Tuning for srm, to call alm() instead
    # if(is.srm(object)){
    #     newCall[[1]] <- as.name("alm");
    #     newCall$folder <- NULL;
    # }
    # This is based on the expanded data, so that we don't need to redo everything
    if(interceptIsNeeded){
        newCall$formula <- as.formula(paste0("`",colnames(object$data)[1],"`~."));
    }
    else{
        newCall$formula <- as.formula(paste0("`",colnames(object$data)[1],"`~.-1"));
    }
    newCall$data <- substitute(object$data);
    newCall$distribution <- object$distribution;
    if(object$loss=="custom"){
        newCall$loss <- object$lossFunction;
    }
    else{
        newCall$loss <- object$loss;
    }

    # Deal with ari components. Remove the columns with lagged variables
    arimaModel <- !is.null(object$other$polynomial);
    if(arimaModel){
        newCall$data <- newCall$data[,!(colnames(newCall$data) %in% names(object$other$polynomial))];
    }

    # If this is ARIMA, and the size wasn't specified, make it changeable
    if(arimaModel && is.null(cl$size)){
        changeSize <- TRUE;
    }
    else{
        changeSize <- FALSE;
    }

    newCall$fast <- TRUE;
    if(any(object$distribution==c("dchisq","dt"))){
        newCall$nu <- object$other$nu;
    }
    else if(object$distribution=="dnbinom"){
        newCall$size <- object$other$size;
    }
    else if(object$distribution=="dalaplace"){
        newCall$alpha <- object$other$alpha;
    }
    else if(object$distribution=="dfnorm"){
        newCall$sigma <- object$other$sigma;
    }
    else if(object$distribution=="dbcnorm"){
        newCall$lambdaBC <- object$other$lambdaBC;
    }
    else if(any(object$distribution==c("dgnorm","dlgnorm"))){
        newCall$shape <- object$other$shape;
    }
    newCall$occurrence <- object$occurrence;
    # Only pre-initialise the parameters if non-normal stuff is used
    if(all(object$distribution!=c("dnorm","dlnorm","dlogitnorm"))){
        newCall$B <- coef(object);
    }
    # If there is scale model, remove it
    if(is.scale(object$scale)){
        newCall$scale <- NULL;
    }

    # Function creates a random sample. Needed for dynamic models
    sampler <- function(indices,size,replace,prob,arimaModel=FALSE,changeSize=FALSE){
        if(arimaModel){
            if(changeSize){
                size[] <- floor(runif(1,nVariables,obsInsample))+1;
            }
            # This way we return the continuos sample, but with random starting point
            return(floor(runif(1,0,obsInsample-size))+c(1:size));
        }
        else{
            return(sample(indices,size=size,replace=replace,prob=prob));
        }
    }

    if(!parallel){
        for(i in 1:nsim){
            newCall$subset <- sampler(indices,size,replace,prob,arimaModel,changeSize);
            testModel <- suppressWarnings(eval(newCall));
            coefBootstrap[i,variablesNamesMade %in% names(coef(testModel))] <- coef(testModel);
        }
    }
    else{
        # We don't do rbind for security reasons - in order to deal with skipped variables
        coefBootstrapParallel <- foreach::`%dopar%`(foreach::foreach(i=1:nsim),{
            newCall$subset <- sampler(indices,size,replace,prob,arimaModel,changeSize);
            testModel <- eval(newCall);
            return(coef(testModel));
        })
        # Prepare the matrix with parameters
        for(i in 1:nsim){
            coefBootstrap[i,variablesNamesMade %in% names(coefBootstrapParallel[[i]])] <- coefBootstrapParallel[[i]];
        }
    }

    # Get rid of NAs. They mean "zero"
    coefBootstrap[is.na(coefBootstrap)] <- 0;

    # Rename the variables to the originals
    colnames(coefBootstrap) <- names(coefficientsOriginal);

    # Centre the coefficients for the calculation of the vcov
    coefvcov <- coefBootstrap - matrix(colMeans(coefBootstrap), nsim, nVariables, byrow=TRUE);

    return(structure(list(vcov=(t(coefvcov) %*% coefvcov)/(nsim-1),
                          coefficients=coefBootstrap,
                          nsim=nsim, size=size, replace=replace, prob=prob, parallel=parallel,
                          model=object$call[[1]], timeElapsed=Sys.time()-startTime),
                     class="bootstrap"));
}


#' @export
print.bootstrap <- function(x, ...){
    cat(paste0("Bootstrap for the ", x$model, " model with nsim=",x$nsim," and size=",x$size,"\n"))
    cat(paste0("Time elapsed: ",round(as.numeric(x$timeElapsed,units="secs"),2)," seconds\n"));
}

Try the greybox package in your browser

Any scripts or data that you put into this service are public.

greybox documentation built on Sept. 16, 2023, 9:07 a.m.