R/predict_glm.R

Defines functions predict.spglm_list get_pred_spgautor_parallel get_pred_spgautor predict.spgautor get_pred_spglm predict.spglm

Documented in predict.spgautor predict.spglm predict.spglm_list

#' @param newdata_size The \code{size} value for each observation in \code{newdata}
#'   used when predicting for the binomial family.
#' @param var_correct A logical indicating whether to return the corrected prediction
#'   variances when predicting via models fit using \code{spglm()} or \code{spgautor()}. The default is
#'   \code{TRUE}.
#' @param dispersion The dispersion value assumed when computing the prediction standard errors
#'   for \code{spglm()} or \code{spgautor()} model objects when \code{family}
#'   is \code{"nbinomial"}, \code{"beta"}, \code{"Gamma"}, or \code{"inverse.gaussian"}.
#'   If omitted, the model object dispersion parameter is used.
#' @rdname predict.spmodel
#' @method predict spglm
#' @order 9
#' @export
#' @examples
#' \donttest{
#' spgmod <- spglm(presence ~ elev * strat,
#'   family = "binomial",
#'   data = moose,
#'   spcov_type = "exponential"
#' )
#' predict(spgmod, moose_preds)
#' predict(spgmod, moose_preds, interval = "prediction")
#' augment(spgmod, newdata = moose_preds, interval = "prediction")
#' }
predict.spglm <- function(object, newdata, type = c("link", "response", "terms"), se.fit = FALSE, interval = c("none", "confidence", "prediction"),
                          level = 0.95, dispersion = NULL, terms = NULL, local, var_correct = TRUE, newdata_size, na.action = na.fail, ...) {



  # match type argument so the valid choices display in error messages
  type <- match.arg(type)

  # match interval argument so the valid choices display in error messages
  interval <- match.arg(interval)

  # treat a missing newdata_size as NULL (defaulted later for the binomial family)
  if (missing(newdata_size)) newdata_size <- NULL

  # treat a missing local as NULL (resolved after the sample-size check below)
  if (missing(local)) {
    local <- NULL
  }

  # handle dispersion argument if provided
  if (!is.null(dispersion)) {
    if (object$family %in% c("binomial", "poisson") && dispersion != 1) {
      stop("dispersion is fixed at one for binomial and poisson families.", call. = FALSE)
    }
    # overwrite the stored dispersion so coef(object, type = "dispersion") returns it below
    object$coefficients$dispersion[1] <- dispersion
  }

  # error if newdata missing from arguments and object
  if (missing(newdata) && is.null(object$newdata)) {
    stop("No missing data to predict. newdata must be specified in the newdata argument or object$newdata must be non-NULL.", call. = FALSE)
  }

  # rename relevant quantities
  obdata <- object$obdata
  xcoord <- object$xcoord
  ycoord <- object$ycoord

  # when newdata is absent, predict at the stored rows with missing response;
  # add_newdata_rows flags that output should be named by object$missing_index
  if (missing(newdata)) {
    add_newdata_rows <- TRUE
    newdata <- object$newdata
  } else {
    add_newdata_rows <- FALSE
  }

  # default binomial size is 1 (Bernoulli) when not supplied
  if (is.null(newdata_size) && object$family == "binomial") {
    newdata_size <- rep(1, NROW(newdata))
  }

  # resolve local: default to TRUE (efficient local approximation) for large samples
  if (is.null(local)) {
    if (object$n > 10000) {
    # if (object$n > 5000 || NROW(newdata) > 5000) {
      local <- TRUE
      message("Because the sample size of the fitted model object exceeds 10,000, we are setting local = TRUE to perform computationally efficient approximations. To override this behavior and compute the exact solution, rerun predict() with local = FALSE. Be aware that setting local = FALSE may result in exceedingly long computational times.")
    } else {
      local <- FALSE
    }
  }

  # save spcov param vector
  spcov_params_val <- coef(object, type = "spcov")

  # save dispersion param vector
  dispersion_params_val <- as.vector(coef(object, type = "dispersion")) # remove class

  # save randcov param vector
  randcov_params_val <- coef(object, type = "randcov")

  # reject sp-package objects; only sf spatial objects are supported
  attr_sp <- attr(class(newdata), "package")
  if (!is.null(attr_sp) && length(attr_sp) == 1 && attr_sp == "sp") {
    stop("sf objects must be used instead of sp objects. To convert your sp object into an sf object, run sf::st_as_sf().", call. = FALSE)
  }

  if (inherits(newdata, "sf")) {
    # take centroids so polygon geometries reduce to point coordinates
    newdata <- suppressWarnings(sf::st_centroid(newdata))

    newdata <- sf_to_df(newdata)
    names(newdata)[[which(names(newdata) == ".xcoord")]] <- as.character(xcoord) # rename coordinate column; only relevant when newdata is sf but obdata is not
    names(newdata)[[which(names(newdata) == ".ycoord")]] <- as.character(ycoord) # rename coordinate column; only relevant when newdata is sf but obdata is not
  }

  # add back in zero column to cover anisotropy (should make anisotropy only available 1-d)
  if (object$dim_coords == 1) {
    obdata[[ycoord]] <- 0
    newdata[[ycoord]] <- 0
  }

  # transform both observed and prediction coordinates to the isotropic scale
  if (object$anisotropy) { # could just do rotate != 0 || scale != 1
    obdata_aniscoords <- transform_anis(obdata, xcoord, ycoord,
      rotate = spcov_params_val[["rotate"]],
      scale = spcov_params_val[["scale"]]
    )
    obdata[[xcoord]] <- obdata_aniscoords$xcoord_val
    obdata[[ycoord]] <- obdata_aniscoords$ycoord_val
    newdata_aniscoords <- transform_anis(newdata, xcoord, ycoord,
      rotate = spcov_params_val[["rotate"]],
      scale = spcov_params_val[["scale"]]
    )
    newdata[[xcoord]] <- newdata_aniscoords$xcoord_val
    newdata[[ycoord]] <- newdata_aniscoords$ycoord_val
  }

  # build the fixed-effects design matrix for newdata (response term dropped)
  formula_newdata <- delete.response(terms(object))
  # fix model frame bug with degree 2 basic polynomial and one prediction row
  # e.g. poly(x, y, degree = 2) and newdata has one row
  if (any(grepl("nmatrix.", attributes(formula_newdata)$dataClasses, fixed = TRUE)) && NROW(newdata) == 1) {
    # duplicate the single row, build the model matrix, then keep only the first row
    newdata <- newdata[c(1, 1), , drop = FALSE]
    newdata_model_frame <- model.frame(formula_newdata, newdata, drop.unused.levels = FALSE, na.action = na.pass, xlev = object$xlevels)
    newdata_model <- model.matrix(formula_newdata, newdata_model_frame, contrasts = object$contrasts)
    newdata_model <- newdata_model[1, , drop = FALSE]
    # find offset
    offset <- model.offset(newdata_model_frame)
    if (!is.null(offset)) {
      offset <- offset[1]
    }
    newdata <- newdata[1, , drop = FALSE]
  } else {
    newdata_model_frame <- model.frame(formula_newdata, newdata, drop.unused.levels = FALSE, na.action = na.pass, xlev = object$xlevels)
    # assumes that predicted observations are not outside the factor levels
    newdata_model <- model.matrix(formula_newdata, newdata_model_frame, contrasts = object$contrasts)
    # find offset
    offset <- model.offset(newdata_model_frame)
  }
  # keep only the columns present in the fitted design matrix, preserving attributes
  attr_assign <- attr(newdata_model, "assign")
  attr_contrasts <- attr(newdata_model, "contrasts")
  keep_cols <- which(colnames(newdata_model) %in% colnames(model.matrix(object)))
  newdata_model <- newdata_model[, keep_cols, drop = FALSE]
  attr(newdata_model, "assign") <- attr_assign[keep_cols]
  attr(newdata_model, "contrasts") <- attr_contrasts

  # finding rows w/out NA
  ob_predictors <- complete.cases(newdata_model)
  if (any(!ob_predictors)) {
    stop("Cannot have NA values in predictors.", call. = FALSE)
  }

  # call terms if needed
  if (type == "terms") {
    # glm supports standard errors for terms objects but not intervals (no interval argument)
    # scale df not used for glms
    return(predict_terms(object, newdata_model, se.fit, scale = NULL, df = Inf, interval, level, add_newdata_rows, terms, ...))
  }

  # storing newdata rows as a list (one element per prediction location)
  newdata_rows_list <- split(newdata, seq_len(NROW(newdata)))

  # storing model-matrix rows as a list (split() on the matrix groups by row here)
  newdata_model_list <- split(newdata_model, seq_len(NROW(newdata)))

  # pairing each data row with its model-matrix row for the per-location worker
  newdata_list <- mapply(x = newdata_rows_list, y = newdata_model_list, FUN = function(x, y) list(row = x, x0 = y), SIMPLIFY = FALSE)

  if (interval %in% c("none", "prediction")) {



    # local prediction list
    local_list <- get_local_list_prediction(local)

    dotlist <- list(...)
    dotlist_names <- names(dotlist)

    # reuse precomputed random-effect structures if supplied via ... (avoids recomputation)
    if ("extra_randcov_list" %in% dotlist_names && !is.null(dotlist[["extra_randcov_list"]])) {
      extra_randcov_list <- dotlist$extra_randcov_list
    } else {
      extra_randcov_list <- get_extra_randcov_list(object, obdata, newdata)
    }
    reform_bar2_list <- extra_randcov_list$reform_bar2_list
    Z_index_obdata_list <- extra_randcov_list$Z_index_obdata_list
    reform_bar1_list <- extra_randcov_list$reform_bar1_list
    Z_val_obdata_list <- extra_randcov_list$Z_val_obdata_list

    # reuse precomputed partition-factor structures if supplied via ...
    if ("extra_partition_list" %in% dotlist_names && !is.null(dotlist[["extra_partition_list"]])) {
      extra_partition_list <- dotlist$extra_partition_list
    } else {
      extra_partition_list <- get_extra_partition_list(object, obdata, newdata)
    }
    reform_bar2 <- extra_partition_list$reform_bar2
    partition_index_obdata <- extra_partition_list$partition_index_obdata

    # # random stuff
    # if (!is.null(object$random)) {
    #   randcov_names <- get_randcov_names(object$random)
    #   # this causes a memory leak and was not even needed
    #   # randcov_Zs <- get_randcov_Zs(obdata, randcov_names)
    #   # comment out here for simple
    #   reform_bar_list <- lapply(randcov_names, function(randcov_name) {
    #     bar_split <- unlist(strsplit(randcov_name, " | ", fixed = TRUE))
    #     reform_bar2 <- reformulate(bar_split[[2]], intercept = FALSE)
    #     if (bar_split[[1]] != "1") {
    #       reform_bar1 <- reformulate(bar_split[[1]], intercept = FALSE)
    #     } else {
    #       reform_bar1 <- NULL
    #     }
    #     list(reform_bar2 = reform_bar2, reform_bar1 = reform_bar1)
    #   })
    #   reform_bar2_list <- lapply(reform_bar_list, function(x) x$reform_bar2)
    #   names(reform_bar2_list) <- randcov_names
    #   reform_bar1_list <- lapply(reform_bar_list, function(x) x$reform_bar1)
    #   names(reform_bar1_list) <- randcov_names
    #   Z_index_obdata_list <- lapply(reform_bar2_list, function(reform_bar2) {
    #     reform_bar2_mf <- model.frame(reform_bar2, obdata)
    #     reform_bar2_terms <- terms(reform_bar2_mf)
    #     reform_bar2_xlev <- .getXlevels(reform_bar2_terms, reform_bar2_mf)
    #     reform_bar2_mx <- model.matrix(reform_bar2, obdata)
    #     reform_bar2_names <- colnames(reform_bar2_mx)
    #     reform_bar2_split <- split(reform_bar2_mx, seq_len(NROW(reform_bar2_mx)))
    #     reform_bar2_vals <- reform_bar2_names[vapply(reform_bar2_split, function(y) which(as.logical(y)), numeric(1))]
    #
    #
    #     # adding dummy levels if newdata observations of random effects are not in original data
    #     # terms object is unchanged if levels change
    #     # reform_bar2_mf_new <- model.frame(reform_bar2, newdata)
    #     # reform_bar2_mf_full <- model.frame(reform_bar2, merge(obdata, newdata, all = TRUE))
    #     # reform_bar2_terms_full <- terms(rbind(reform_bar2_mf, reform_bar2_mf_new))
    #     reform_bar2_xlev_full <- .getXlevels(reform_bar2_terms, rbind(reform_bar2_mf, model.frame(reform_bar2, newdata)))
    #     if (!identical(reform_bar2_xlev, reform_bar2_xlev_full)) {
    #       reform_bar2_xlev <- reform_bar2_xlev_full
    #     }
    #
    #
    #     list(reform_bar2_vals = reform_bar2_vals, reform_bar2_xlev = reform_bar2_xlev)
    #   })
    #   # Z_index_obdata_list <- lapply(reform_bar2_list, function(reform_bar2) as.vector(model.matrix(reform_bar2, obdata)))
    #   names(Z_index_obdata_list) <- randcov_names
    #   Z_val_obdata_list <- lapply(reform_bar1_list, function(reform_bar1) {
    #     if (is.null(reform_bar1)) {
    #       return(NULL)
    #     } else {
    #       return(as.vector(model.matrix(reform_bar1, obdata)))
    #     }
    #   })
    #   names(Z_val_obdata_list) <- randcov_names
    # } else {
    #   reform_bar2_list <- NULL
    #   Z_index_obdata_list <- NULL
    #   reform_bar1_list <- NULL
    #   Z_val_obdata_list <- NULL
    # }
    #
    # # partition factor stuff
    # if (!is.null(object$partition_factor)) {
    #   partition_factor_val <- get_partition_name(labels(terms(object$partition_factor)))
    #   bar_split <- unlist(strsplit(partition_factor_val, " | ", fixed = TRUE))
    #   reform_bar2 <- reformulate(bar_split[[2]], intercept = FALSE)
    #   p_reform_bar2_mf <- model.frame(reform_bar2, obdata)
    #   p_reform_bar2_terms <- terms(p_reform_bar2_mf)
    #   p_reform_bar2_xlev <- .getXlevels(p_reform_bar2_terms, p_reform_bar2_mf)
    #   p_reform_bar2_mx <- model.matrix(reform_bar2, obdata)
    #   p_reform_bar2_names <- colnames(p_reform_bar2_mx)
    #   p_reform_bar2_split <- split(p_reform_bar2_mx, seq_len(NROW(p_reform_bar2_mx)))
    #   p_reform_bar2_vals <- p_reform_bar2_names[vapply(p_reform_bar2_split, function(y) which(as.logical(y)), numeric(1))]
    #
    #
    #   # adding dummy levels if newdata observations of random effects are not in original data
    #   # terms object is unchanged if levels change
    #   # p_reform_bar2_mf_new <- model.frame(reform_bar2, newdata)
    #   # reform_bar2_mf_full <- model.frame(reform_bar2, merge(obdata, newdata, all = TRUE))
    #   # p_reform_bar2_terms_full <- terms(rbind(p_reform_bar2_mf, p_reform_bar2_mf_new))
    #   p_reform_bar2_xlev_full <- .getXlevels(p_reform_bar2_terms, rbind(p_reform_bar2_mf, model.frame(reform_bar2, newdata)))
    #   if (!identical(p_reform_bar2_xlev, p_reform_bar2_xlev_full)) {
    #     p_reform_bar2_xlev <- p_reform_bar2_xlev_full
    #   }
    #
    #   partition_index_obdata <- list(reform_bar2_vals = p_reform_bar2_vals, reform_bar2_xlev = p_reform_bar2_xlev)
    #   # partition_index_obdata <- as.vector(model.matrix(reform_bar2, obdata))
    # } else {
    #   reform_bar2 <- NULL
    #   partition_index_obdata <- NULL
    # }

    # full-data Cholesky only when all observations are used ("all" method);
    # otherwise each local prediction builds its own covariance and applies its
    # own per-prediction variance adjustment (predvar_adjust_ind)
    if (local_list$method == "all") {
      cov_matrix_val <- covmatrix(object)
      cov_lowchol <- t(chol(cov_matrix_val))
      predvar_adjust_ind <- FALSE
      predvar_adjust_all <- TRUE
    } else {
      cov_lowchol <- NULL
      predvar_adjust_ind <- TRUE
      predvar_adjust_all <- FALSE
    }

    # change predvar adjust based on var correct
    if (!var_correct) {
      predvar_adjust_ind <- FALSE
      predvar_adjust_all <- FALSE
    }

    if (local_list$parallel) {
      # parallel path: same worker and arguments as the serial path below
      cl <- parallel::makeCluster(local_list$ncores)
      pred_spglm <- parallel::parLapply(cl, newdata_list, get_pred_spglm,
        se.fit = se.fit,
        interval = interval, formula = object$terms,
        obdata = obdata, xcoord = xcoord, ycoord = ycoord,
        spcov_params_val = spcov_params_val, random = object$random,
        randcov_params_val = randcov_params_val,
        reform_bar2_list = reform_bar2_list,
        Z_index_obdata_list = Z_index_obdata_list,
        reform_bar1_list = reform_bar1_list,
        Z_val_obdata_list = Z_val_obdata_list,
        partition_factor = object$partition_factor,
        reform_bar2 = reform_bar2, partition_index_obdata = partition_index_obdata,
        cov_lowchol = cov_lowchol,
        Xmat = model.matrix(object),
        y = object$y, dim_coords = object$dim_coords,
        betahat = coefficients(object), cov_betahat = vcov(object, var_correct = FALSE),
        contrasts = object$contrasts,
        local = local_list, family = object$family, w = fitted(object, type = "link"), size = object$size,
        dispersion = dispersion_params_val, predvar_adjust_ind = predvar_adjust_ind,
        xlevels = object$xlevels, diagtol = object$diagtol
      )
      cl <- parallel::stopCluster(cl)
    } else {
      pred_spglm <- lapply(newdata_list, get_pred_spglm,
        se.fit = se.fit,
        interval = interval, formula = object$terms,
        obdata = obdata, xcoord = xcoord, ycoord = ycoord,
        spcov_params_val = spcov_params_val, random = object$random,
        randcov_params_val = randcov_params_val,
        reform_bar2_list = reform_bar2_list,
        Z_index_obdata_list = Z_index_obdata_list,
        reform_bar1_list = reform_bar1_list,
        Z_val_obdata_list = Z_val_obdata_list,
        partition_factor = object$partition_factor,
        reform_bar2 = reform_bar2, partition_index_obdata = partition_index_obdata,
        cov_lowchol = cov_lowchol,
        Xmat = model.matrix(object),
        y = object$y, dim_coords = object$dim_coords,
        betahat = coefficients(object), cov_betahat = vcov(object, var_correct = FALSE),
        contrasts = object$contrasts,
        local = local_list, family = object$family,
        w = fitted(object, type = "link"), size = object$size,
        dispersion = dispersion_params_val, predvar_adjust_ind = predvar_adjust_ind,
        xlevels = object$xlevels, diagtol = object$diagtol
      )
    }

    if (interval == "none") {
      fit <- vapply(pred_spglm, function(x) x$fit, numeric(1))
      # apply offset (on the link scale, before any inverse-link transform)
      if (!is.null(offset)) {
        fit <- fit + offset
      }
      if (type == "response") {
        fit <- invlink(fit, object$family, newdata_size)
      }
      if (se.fit) {
        vars <- vapply(pred_spglm, function(x) x$var, numeric(1))
        if (predvar_adjust_all) {
          # predvar_adjust is for the local function so FALSE there is TRUE
          # here
          vars_adj <- get_wts_varw(
            family = object$family,
            Xmat = model.matrix(object),
            y = object$y,
            w = fitted(object, type = "link"),
            size = object$size,
            dispersion = dispersion_params_val,
            cov_lowchol = cov_lowchol,
            x0 = newdata_model,
            c0 = covmatrix(object, newdata)
          )
          vars <- vars_adj + vars
        }
        se <- sqrt(vars)
        if (add_newdata_rows) {
          names(fit) <- object$missing_index
          names(se) <- object$missing_index
        }
        return(list(fit = fit, se.fit = se))
      } else {
        if (add_newdata_rows) {
          names(fit) <- object$missing_index
        }
        return(fit)
      }
    }

    if (interval == "prediction") {
      fit <- vapply(pred_spglm, function(x) x$fit, numeric(1))
      # apply offset (on the link scale, before any inverse-link transform)
      if (!is.null(offset)) {
        fit <- fit + offset
      }
      vars <- vapply(pred_spglm, function(x) x$var, numeric(1))
      if (predvar_adjust_all) {
        # single variance correction for all predictions when method == "all"
        vars_adj <- get_wts_varw(
          family = object$family,
          Xmat = model.matrix(object),
          y = object$y,
          w = fitted(object, type = "link"),
          size = object$size,
          dispersion = dispersion_params_val,
          cov_lowchol = cov_lowchol,
          x0 = newdata_model,
          c0 = covmatrix(object, newdata)
        )
        vars <- vars_adj + vars
      }
      se <- sqrt(vars)
      # normal quantile used (not t), consistent with glm-style asymptotics
      # tstar <- qt(1 - (1 - level) / 2, df = object$n - object$p)
      tstar <- qnorm(1 - (1 - level) / 2)
      lwr <- fit - tstar * se
      upr <- fit + tstar * se
      if (type == "response") {
        # interval endpoints are transformed through the inverse link
        fit <- invlink(fit, object$family, newdata_size)
        lwr <- invlink(lwr, object$family, newdata_size)
        upr <- invlink(upr, object$family, newdata_size)
      }
      fit <- cbind(fit, lwr, upr)
      row.names(fit) <- 1:NROW(fit)
      if (se.fit) {
        if (add_newdata_rows) {
          row.names(fit) <- object$missing_index
          names(se) <- object$missing_index
        }
        return(list(fit = fit, se.fit = se))
      } else {
        if (add_newdata_rows) {
          row.names(fit) <- object$missing_index
        }
        return(fit)
      }
    }
  } else if (interval == "confidence") {
    # finding fitted values of the mean parameters (link scale, X %*% betahat)
    fit <- as.numeric(newdata_model %*% coef(object))
    # apply offset
    if (!is.null(offset)) {
      fit <- fit + offset
    }
    newdata_model_list <- split(newdata_model, seq_len(NROW(newdata_model)))
    # per-row variance x0' V x0 with V = vcov(betahat)
    vars <- as.numeric(vapply(newdata_model_list, function(x) crossprod(x, vcov(object) %*% x), numeric(1)))
    se <- sqrt(vars)
    # normal quantile used (not t), consistent with glm-style asymptotics
    # tstar <- qt(1 - (1 - level) / 2, df = object$n - object$p)
    tstar <- qnorm(1 - (1 - level) / 2)
    lwr <- fit - tstar * se
    upr <- fit + tstar * se
    if (type == "response") {
      fit <- invlink(fit, object$family, newdata_size)
      lwr <- invlink(lwr, object$family, newdata_size)
      upr <- invlink(upr, object$family, newdata_size)
    }
    fit <- cbind(fit, lwr, upr)
    row.names(fit) <- 1:NROW(fit)
    if (se.fit) {
      if (add_newdata_rows) {
        row.names(fit) <- object$missing_index
        names(se) <- object$missing_index
      }
      return(list(fit = fit, se.fit = se))
    } else {
      if (add_newdata_rows) {
        row.names(fit) <- object$missing_index
      }
      return(fit)
    }
  } else {
    # unreachable in practice because match.arg() validates interval above
    stop("Interval must be none, confidence, or prediction")
  }
}

# Compute the prediction (and, when requested, its variance) for a single new
# location from a fitted spglm model. Called once per element of the paired
# newdata list built in predict.spglm(); each element carries $row (the data
# row) and $x0 (the corresponding model-matrix row). Returns list(fit) or
# list(fit, var) on the link scale. When local$method is "distance" or
# "covariance", the observed data are subset to a neighborhood and the
# covariance Cholesky is rebuilt locally; otherwise the precomputed
# cov_lowchol for the full data is used.
get_pred_spglm <- function(newdata_list, se.fit, interval, formula, obdata, xcoord, ycoord,
                           spcov_params_val, random, randcov_params_val, reform_bar2_list,
                           Z_index_obdata_list, reform_bar1_list, Z_val_obdata_list, partition_factor,
                           reform_bar2, partition_index_obdata, cov_lowchol,
                           Xmat, y, betahat, cov_betahat, dim_coords, contrasts, local,
                           family, w, size, dispersion, predvar_adjust_ind, xlevels, diagtol) {


  # storing partition vector (zeroes cross-partition covariance between the
  # new location and observed data)
  partition_vector <- partition_vector(partition_factor,
    data = obdata,
    newdata = newdata_list$row, reform_bar2 = reform_bar2,
    partition_index_data = partition_index_obdata
  )

  # subsetting partition vector (efficient but causes problems later with
  # random effect subsetting), so only applied when the partition factor is
  # not also a random effect term
  if (!is.null(partition_vector) && local$method %in% c("distance", "covariance") &&
      (is.null(random) || !labels(terms(partition_factor)) %in% labels(terms(random)))) {
    partition_index <- as.vector(partition_vector) == 1
    Z_index_obdata_list <- lapply(Z_index_obdata_list, function(x) {
      x$reform_bar2_vals <- x$reform_bar2_vals[partition_index]
      x
    })
    obdata <- obdata[partition_index, , drop = FALSE]
    # after subsetting, everything kept shares the new location's partition level
    partition_vector <- Matrix(1, nrow = 1, ncol = NROW(obdata))
  }

  # distances between the new location and all (remaining) observed locations
  dist_vector <- spdist_vectors(newdata_list$row, obdata, xcoord, ycoord, dim_coords)

  # # subsetting data if method distance
  # if (local$method == "distance") {
  #   n <- length(dist_vector)
  #   nn_index <- order(as.numeric(dist_vector))[seq(from = 1, to = min(n, local$size))]
  #   obdata <- obdata[nn_index, , drop = FALSE]
  #   dist_vector <- dist_vector[, nn_index]
  #   w <- w[nn_index]
  #   y <- y[nn_index]
  #   if (!is.null(size)) {
  #     size <- size[nn_index]
  #   }
  # }

  # making random vector if necessary
  if (!is.null(randcov_params_val)) {
    randcov_vector_val <- randcov_vector(randcov_params_val, obdata, newdata_list$row, reform_bar2_list, Z_index_obdata_list)
  } else {
    randcov_vector_val <- NULL
  }

  # making the covariance vector (built before any neighborhood subsetting so
  # both selection rules below can operate on it)
  cov_vector_val <- cov_vector(spcov_params_val, dist_vector, randcov_vector_val, partition_vector)

  # subsetting data if method distance
  if (local$method == "distance") {
    n <- length(cov_vector_val)
    # want the smallest distance here and order goes from smallest first to largest last (keep last values with are smallest distance)
    nn_index <- order(as.numeric(dist_vector))[seq(from = 1, to = min(n, local$size))]
    obdata <- obdata[nn_index, , drop = FALSE]
    cov_vector_val <- cov_vector_val[nn_index]
    w <- w[nn_index]
    y <- y[nn_index]
    if (!is.null(size)) {
      size <- size[nn_index]
    }
  }


  if (local$method == "covariance") {
    n <- length(cov_vector_val)
    # want the largest covariance here and order goes from smallest first to largest last (keep last values which are largest covariance)
    cov_index <- order(as.numeric(cov_vector_val))[seq(from = n, to = max(1, n - local$size + 1))] # use abs() here?
    obdata <- obdata[cov_index, , drop = FALSE]
    cov_vector_val <- cov_vector_val[cov_index]
    w <- w[cov_index]
    y <- y[cov_index]
    if (!is.null(size)) {
      size <- size[cov_index]
    }
  }

  # rebuild the local covariance Cholesky and design matrix for the subset
  if (local$method %in% c("distance", "covariance")) {
    if (!is.null(random)) {
      randcov_names <- get_randcov_names(random)
      xlev_list <- lapply(Z_index_obdata_list, function(x) x$reform_bar2_xlev)
      randcov_Zs <- get_randcov_Zs(obdata, randcov_names, xlev_list = xlev_list)
    }
    partition_matrix_val <- partition_matrix(partition_factor, obdata)
    cov_matrix_val <- cov_matrix(
      spcov_params_val, spdist(obdata, xcoord, ycoord), randcov_params_val,
      randcov_Zs, partition_matrix_val,
      diagtol = diagtol
    )
    # forceSymmetric guards against tiny asymmetries before the Cholesky
    cov_lowchol <- t(Matrix::chol(Matrix::forceSymmetric(cov_matrix_val)))
    model_frame <- model.frame(formula, obdata, drop.unused.levels = TRUE, na.action = na.pass, xlev = xlevels)
    Xmat <- model.matrix(formula, model_frame, contrasts = contrasts)
  }



  c0 <- as.numeric(cov_vector_val)
  # whitened quantities: forwardsolve with the lower-triangular Cholesky factor
  SqrtSigInv_X <- forwardsolve(cov_lowchol, Xmat)
  SqrtSigInv_w <- forwardsolve(cov_lowchol, w)
  residuals_pearson <- SqrtSigInv_w - SqrtSigInv_X %*% betahat
  SqrtSigInv_c0 <- forwardsolve(cov_lowchol, c0)
  x0 <- newdata_list$x0

  # universal-kriging-style predictor of the latent link-scale value
  fit <- as.numeric(x0 %*% betahat + Matrix::crossprod(SqrtSigInv_c0, residuals_pearson))
  H <- x0 - Matrix::crossprod(SqrtSigInv_c0, SqrtSigInv_X)
  if (se.fit || interval == "prediction") {
    # total variance at the new location: de + ie + random-effect variances
    total_var <- sum(spcov_params_val[["de"]], spcov_params_val[["ie"]], randcov_params_val)
    var <- as.numeric(total_var - Matrix::crossprod(SqrtSigInv_c0, SqrtSigInv_c0) + H %*% Matrix::tcrossprod(cov_betahat, H))
    if (predvar_adjust_ind) {
      # per-prediction correction for uncertainty in the estimated latent w values
      var_adj <- get_wts_varw(family, Xmat, y, w, size, dispersion, cov_lowchol, x0, c0)
      var <- var_adj + var
    }
    pred_list <- list(fit = fit, var = var)
  } else {
    pred_list <- list(fit = fit)
  }
  pred_list
}







#' @rdname predict.spmodel
#' @method predict spgautor
#' @order 10
#' @export
predict.spgautor <- function(object, newdata, type = c("link", "response", "terms"), se.fit = FALSE,
                             interval = c("none", "confidence", "prediction"),
                             level = 0.95, dispersion = NULL, terms = NULL, local, var_correct = TRUE, newdata_size, na.action = na.fail, ...) {

  # match type argument so the two display
  type <- match.arg(type)

  # match interval argument so the three display
  interval <- match.arg(interval)

  # deal with newdata_size
  if (missing(newdata_size)) newdata_size <- NULL

  # deal with local
  if (missing(local)) {
    local <- NULL
  }

  # handle dispersion argument if provided
  if (!is.null(dispersion)) {
    if (object$family %in% c("binomial", "poisson") && dispersion != 1) {
      stop("dispersion is fixed at one for binomial and poisson families.", call. = FALSE)
    }
    object$coefficients$dispersion[1] <- dispersion
  }

  # error if newdata missing from arguments and object
  if (missing(newdata) && is.null(object$newdata)) {
    stop("No missing data to predict. newdata must be specified in the newdata argument or object$newdata must be non-NULL.", call. = FALSE)
  }

  # deal with local
  if (is.null(local)) {
    local <- FALSE
  }

  # write newdata if predicting missing data
  newdata <- object$data[object$missing_index, , drop = FALSE]

  # set newdata_size if needed
  if (is.null(newdata_size) && object$family == "binomial") {
    newdata_size <- rep(1, NROW(newdata))
  }

  # save spcov param vector
  spcov_params_val <- coef(object, type = "spcov")

  # save dispersion param vector
  dispersion_params_val <- as.vector(coef(object, type = "dispersion")) # remove class

  # save randcov param vector
  randcov_params_val <- coef(object, type = "randcov")



  formula_newdata <- delete.response(terms(object))
  # fix model frame bug with degree 2 basic polynomial and one prediction row
  # e.g. poly(x, y, degree = 2) and newdata has one row
  if (any(grepl("nmatrix.", attributes(formula_newdata)$dataClasses, fixed = TRUE)) && NROW(newdata) == 1) {
    newdata <- newdata[c(1, 1), , drop = FALSE]
    newdata_model_frame <- model.frame(formula_newdata, newdata, drop.unused.levels = FALSE, na.action = na.pass, xlev = object$xlevels)
    newdata_model <- model.matrix(formula_newdata, newdata_model_frame, contrasts = object$contrasts)
    newdata_model <- newdata_model[1, , drop = FALSE]
    # find offset
    offset <- model.offset(newdata_model_frame)
    if (!is.null(offset)) {
      offset <- offset[1]
    }
    newdata <- newdata[1, , drop = FALSE]
  } else {
    newdata_model_frame <- model.frame(formula_newdata, newdata, drop.unused.levels = FALSE, na.action = na.pass, xlev = object$xlevels)
    # assumes that predicted observations are not outside the factor levels
    newdata_model <- model.matrix(formula_newdata, newdata_model_frame, contrasts = object$contrasts)
    # find offset
    offset <- model.offset(newdata_model_frame)
  }
  attr_assign <- attr(newdata_model, "assign")
  attr_contrasts <- attr(newdata_model, "contrasts")
  keep_cols <- which(colnames(newdata_model) %in% colnames(model.matrix(object)))
  newdata_model <- newdata_model[, keep_cols, drop = FALSE]
  attr(newdata_model, "assign") <- attr_assign[keep_cols]
  attr(newdata_model, "contrasts") <- attr_contrasts

  # finding rows w/out NA
  # this isn't really needed, because the error should come on model building
  # but someone could accidentally write over their newdata object after fitting
  # so it is good to keep for completeness
  ob_predictors <- complete.cases(newdata_model)
  if (any(!ob_predictors)) {
    stop("Cannot have NA values in predictors.", call. = FALSE)
  }

  # call terms if needed
  if (type == "terms") {
    # scale df not used for glms
    return(predict_terms(object, newdata_model, se.fit, scale = NULL, df = Inf, interval, level, add_newdata_rows = TRUE, terms, ...))
  }


  # storing newdata as a list
  newdata_list <- split(newdata, seq_len(NROW(newdata)))

  # storing newdata as a list
  newdata_model_list <- split(newdata_model, seq_len(NROW(newdata)))

  if (interval %in% c("none", "prediction")) {

    # # randcov
    randcov_Zs_val <- get_randcov_Zs(randcov_names = names(randcov_params_val), data = object$data)
    # making the partition matrix
    partition_matrix_val <- partition_matrix(object$partition_factor, object$data)
    # making the covariance matrix
    cov_matrix_val <- cov_matrix(spcov_params_val, object$W, randcov_params_val, randcov_Zs_val, partition_matrix_val, object$M)
    # cov_matrix_val_obs <- covmatrix(object)

    # making the covariance vector
    cov_vector_val <- cov_matrix_val[object$missing_index, object$observed_index, drop = FALSE]
    # cov_vector_val <- covmatrix(object, newdata = object$newdata)

    # splitting the covariance vector
    cov_vector_val_list <- split(cov_vector_val, seq_len(NROW(cov_vector_val)))

    # # lower triangular cholesky
    cov_matrix_lowchol <- t(chol(cov_matrix_val[object$observed_index, object$observed_index, drop = FALSE]))
    # cov_matrix_lowchol <- t(chol(cov_matrix_val_obs))

    # find X observed
    X <- model.matrix(object)
    SqrtSigInv_X <- forwardsolve(cov_matrix_lowchol, X)

    # find w observed
    w <- fitted(object, type = "link")
    SqrtSigInv_w <- forwardsolve(cov_matrix_lowchol, w)

    # beta hat
    betahat <- coef(object)

    # residuals pearson
    residuals_pearson_w <- SqrtSigInv_w - SqrtSigInv_X %*% betahat

    # cov beta hat
    cov_betahat <- vcov(object, var_correct = FALSE)

    # total var
    total_var_list <- as.list(diag(cov_matrix_val[object$missing_index, object$missing_index, drop = FALSE]))

    # local prediction list (only for parallel)
    local_list <- get_local_list_prediction(local)

    # local stuff for parallel
    if (local_list$parallel) {
      cl <- parallel::makeCluster(local_list$ncores)
      cluster_list <- lapply(seq_along(newdata_model_list), function(l) {
        cluster_list_element <- list(
          x0 = newdata_model_list[[l]],
          c0 = cov_vector_val_list[[l]],
          s0 = total_var_list[[l]]
        )
      })
      pred_spautor <- parallel::parLapply(cl, cluster_list, get_pred_spgautor_parallel,
        cov_matrix_lowchol, betahat,
        residuals_pearson_w,
        cov_betahat, SqrtSigInv_X,
        se.fit = se.fit,
        interval = interval
      )
      cl <- parallel::stopCluster(cl)
    } else {
      # make predictions
      pred_spautor <- mapply(
        x0 = newdata_model_list, c0 = cov_vector_val_list, s0 = total_var_list,
        FUN = function(x0, c0, s0) {
          get_pred_spgautor(
            x0 = x0, c0 = c0, s0 = s0,
            cov_matrix_lowchol, betahat,
            residuals_pearson_w,
            cov_betahat, SqrtSigInv_X,
            se.fit = se.fit,
            interval = interval
          )
        }, SIMPLIFY = FALSE
      )
    }

    if (interval == "none") {
      fit <- vapply(pred_spautor, function(x) x$fit, numeric(1))
      # apply offset
      if (!is.null(offset)) {
        fit <- fit + offset
      }
      if (type == "response") {
        fit <- invlink(fit, object$family, newdata_size)
      }
      if (se.fit) {
        vars <- vapply(pred_spautor, function(x) x$var, numeric(1))
        if (var_correct) {
          vars_adj <- get_wts_varw(
            family = object$family,
            Xmat = model.matrix(object),
            y = object$y,
            w = fitted(object, type = "link"),
            size = object$size,
            dispersion = dispersion_params_val,
            cov_lowchol = cov_matrix_lowchol,
            x0 = newdata_model,
            c0 = cov_vector_val
          )
          vars <- vars_adj + vars
        }
        se <- sqrt(vars)
        names(fit) <- object$missing_index
        names(se) <- object$missing_index
        return(list(fit = fit, se.fit = se))
      } else {
        names(fit) <- object$missing_index
        return(fit)
      }
    }

    if (interval == "prediction") {
      fit <- vapply(pred_spautor, function(x) x$fit, numeric(1))
      # apply offset
      if (!is.null(offset)) {
        fit <- fit + offset
      }
      vars <- vapply(pred_spautor, function(x) x$var, numeric(1))
      if (var_correct) {
        vars_adj <- get_wts_varw(
          family = object$family,
          Xmat = model.matrix(object),
          y = object$y,
          w = fitted(object, type = "link"),
          size = object$size,
          dispersion = dispersion_params_val,
          cov_lowchol = cov_matrix_lowchol,
          x0 = newdata_model,
          c0 = cov_vector_val
        )
        vars <- vars_adj + vars
      }
      se <- sqrt(vars)
      # tstar <- qt(1 - (1 - level) / 2, df = object$n - object$p)
      tstar <- qnorm(1 - (1 - level) / 2)
      lwr <- fit - tstar * se
      upr <- fit + tstar * se
      if (type == "response") {
        fit <- invlink(fit, object$family, newdata_size)
        lwr <- invlink(lwr, object$family, newdata_size)
        upr <- invlink(upr, object$family, newdata_size)
      }
      fit <- cbind(fit, lwr, upr)
      row.names(fit) <- 1:NROW(fit)
      if (se.fit) {
        row.names(fit) <- object$missing_index
        names(se) <- object$missing_index
        return(list(fit = fit, se.fit = se))
      } else {
        row.names(fit) <- object$missing_index
        return(fit)
      }
    }
  } else if (interval == "confidence") {
    # finding fitted values of the mean parameters
    fit <- as.numeric(newdata_model %*% coef(object))
    # apply offset
    if (!is.null(offset)) {
      fit <- fit + offset
    }
    vars <- as.numeric(vapply(newdata_model_list, function(x) crossprod(x, vcov(object) %*% x), numeric(1)))
    se <- sqrt(vars)
    # tstar <- qt(1 - (1 - level) / 2, df = object$n - object$p)
    tstar <- qnorm(1 - (1 - level) / 2)
    lwr <- fit - tstar * se
    upr <- fit + tstar * se
    if (type == "response") {
      fit <- invlink(fit, object$family, newdata_size)
      lwr <- invlink(lwr, object$family, newdata_size)
      upr <- invlink(upr, object$family, newdata_size)
    }
    fit <- cbind(fit, lwr, upr)
    row.names(fit) <- 1:NROW(fit)
    if (se.fit) {
      row.names(fit) <- object$missing_index
      names(se) <- object$missing_index
      return(list(fit = fit, se.fit = se))
    } else {
      row.names(fit) <- object$missing_index
      return(fit)
    }
  } else {
    stop("Interval must be none, confidence, or prediction")
  }
}

#' Single-location prediction (link scale) for an autoregressive spatial GLM
#'
#' Computes the best linear unbiased prediction at one unobserved location and,
#' when requested, its prediction variance.
#'
#' @param x0 Row of the prediction design matrix (1 x p).
#' @param c0 Covariance vector between the prediction location and the
#'   observed locations.
#' @param s0 Marginal (prior) variance at the prediction location.
#' @param cov_matrix_lowchol Lower-triangular Cholesky factor of the observed
#'   covariance matrix.
#' @param betahat Estimated fixed effect coefficients.
#' @param residuals_pearson_w Whitened residuals of the link-scale response.
#' @param cov_betahat Covariance matrix of \code{betahat}.
#' @param SqrtSigInv_X Whitened observed design matrix.
#' @param se.fit Logical; if \code{TRUE}, also compute the prediction variance.
#' @param interval Interval type; \code{"prediction"} also triggers the variance.
#' @return A list with element \code{fit} and, when requested, \code{var}.
get_pred_spgautor <- function(x0, c0, s0, cov_matrix_lowchol, betahat, residuals_pearson_w, cov_betahat, SqrtSigInv_X, se.fit, interval) {
  # whiten the covariance vector with the lower-triangular Cholesky factor
  c0_whitened <- forwardsolve(cov_matrix_lowchol, c0)
  # BLUP: fixed-effect mean plus the kriging adjustment from the residuals
  point_pred <- as.numeric(x0 %*% betahat + crossprod(c0_whitened, residuals_pearson_w))
  need_var <- se.fit || interval == "prediction"
  if (!need_var) {
    return(list(fit = point_pred))
  }
  # h_row carries the extra uncertainty from estimating beta
  h_row <- x0 - crossprod(c0_whitened, SqrtSigInv_X)
  pred_var <- as.numeric(
    s0 - crossprod(c0_whitened, c0_whitened) + h_row %*% tcrossprod(cov_betahat, h_row)
  )
  list(fit = point_pred, var = pred_var)
}

#' Parallel worker wrapper around \code{get_pred_spgautor}
#'
#' Unpacks the per-location pieces bundled into \code{cluster_list} for a
#' cluster worker and delegates to \code{get_pred_spgautor}.
#'
#' @param cluster_list A list with elements \code{x0}, \code{c0}, and
#'   \code{s0} for one prediction location.
#' @inheritParams get_pred_spgautor
#' @return The list returned by \code{get_pred_spgautor}.
get_pred_spgautor_parallel <- function(cluster_list, cov_matrix_lowchol, betahat, residuals_pearson_w, cov_betahat, SqrtSigInv_X, se.fit, interval) {
  get_pred_spgautor(
    x0 = cluster_list$x0,
    c0 = cluster_list$c0,
    s0 = cluster_list$s0,
    cov_matrix_lowchol = cov_matrix_lowchol,
    betahat = betahat,
    residuals_pearson_w = residuals_pearson_w,
    cov_betahat = cov_betahat,
    SqrtSigInv_X = SqrtSigInv_X,
    se.fit = se.fit,
    interval = interval
  )
}

#' @name predict.spmodel
#' @method predict spglm_list
#' @order 11
#' @export
predict.spglm_list <- function(object, newdata, type = c("link", "response", "terms"), se.fit = FALSE,
                               interval = c("none", "confidence", "prediction"), level = 0.95,
                               dispersion = NULL, terms = NULL, local, var_correct = TRUE, newdata_size,
                               na.action = na.fail, ...) {
  # resolve the enumerated arguments so partial matches display correctly
  type <- match.arg(type)
  interval <- match.arg(interval)

  # optional arguments without defaults become NULL when not supplied
  if (missing(local)) {
    local <- NULL
  }
  if (missing(newdata_size)) {
    newdata_size <- NULL
  }

  # record newdata availability here; missing() is only valid in this frame
  has_newdata <- !missing(newdata)

  # predict from one fitted model, forwarding the shared arguments
  # (na.action is not forwarded; it is fixed inside the per-model predict)
  predict_one <- function(fit) {
    if (has_newdata) {
      predict(
        fit,
        newdata = newdata,
        type = type,
        se.fit = se.fit,
        interval = interval,
        level = level,
        dispersion = dispersion,
        terms = terms,
        local = local,
        var_correct = var_correct,
        newdata_size = newdata_size,
        ...
      )
    } else {
      predict(
        fit,
        type = type,
        se.fit = se.fit,
        interval = interval,
        level = level,
        dispersion = dispersion,
        terms = terms,
        local = local,
        var_correct = var_correct,
        newdata_size = newdata_size,
        ...
      )
    }
  }

  preds <- lapply(object, predict_one)
  names(preds) <- names(object)
  preds
}

# A list of spgautor fits is handled exactly like a list of spglm fits
# (both loop predict() over the fitted models), so reuse that method body.
#' @name predict.spmodel
#' @method predict spgautor_list
#' @order 12
#' @export
predict.spgautor_list <- predict.spglm_list

Try the spmodel package in your browser

Any scripts or data that you put into this service are public.

spmodel documentation built on April 4, 2025, 1:39 a.m.