R/add_probs_glm.R

Defines functions sim_probs_other probs_logistic probs_gaussian add_probs.glm

Documented in add_probs.glm

# Copyright (C) 2017 Institute for Defense Analyses
#
# This file is part of ciTools.
#
# ciTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ciTools is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ciTools. If not, see <http://www.gnu.org/licenses/>.

#' Response Probabilities for Generalized Linear Models
#'
#' This is the method \code{add_probs} uses if the model fit is an
#' object of class \code{glm}. Probabilities are determined through
#' simulation, using the same method as \code{add_pi.glm}. Currently,
#' only logistic, Poisson, Quasipoisson, and Gamma models are
#' supported.
#'
#' Any of the five comparisons may be made for a Poisson,
#' quasipoisson, or Gamma model: \code{comparison = "<"}, \code{">"},
#' \code{"="}, \code{"<="}, or \code{">="}. For logistic regression,
#' the comparison statement must be equivalent to \eqn{Pr(Y|x = 0)} or
#' \eqn{Pr(Y|x = 1)}.
#'
#' If \code{add_probs} is called on a Poisson, quasiPoisson or Gamma
#' model, a simulation is performed using \code{arm::sim}.
#'
#' If \code{add_probs} is called on a logistic model, the fitted
#' probabilities are used directly (no simulation is required).
#'
#' If \code{add_probs} is called on a Gaussian GLM, the returned
#' probabilities are identical to those given by
#' \code{add_probs.lm}. In this case, the comparisons \code{<} and
#' \code{<=} are identical (likewise for \code{>} and \code{>=}). If
#' the comparison \code{=} is used in the Gaussian GLM, an informative
#' error is returned.
#'
#' @param df A data frame of new data.
#' @param fit An object of class \code{glm}. Predictions are made with
#'     this object.
#' @param q A double. A quantile of the response distribution.
#' @param name \code{NULL} or a string. If \code{NULL}, probabilities
#'     automatically will be named by \code{add_probs()}, otherwise,
#'     the probabilities will be named \code{name} in the returned
#'     dataframe.
#' @param yhatName A string. Name of the vector of predictions.
#' @param comparison A character vector of length one. If
#'     \code{comparison = "<"}, then \eqn{Pr(Y|X < q)} is
#'     calculated. Any comparison is allowed in Poisson regression,
#'     but only certain comparisons may be made in Logistic
#'     regression. See the Details section.
#' @param nSims A positive integer. Controls the number of simulated
#'     draws to make if the model is Poisson.
#' @param ... Additional arguments.
#'
#' @return A dataframe, \code{df}, with predicted values and
#'     probabilities attached.
#'
#' @seealso \code{\link{add_ci.glm}} for confidence intervals for
#'     \code{glm} objects, \code{\link{add_pi.glm}} for prediction
#'     intervals of \code{glm} objects, and
#'     \code{\link{add_quantile.glm}} for response quantiles of
#'     \code{glm} objects.
#'
#' @examples
#' # Fit a Poisson model
#' fit <- glm(dist ~ speed, data = cars, family = "poisson")
#'
#' # Determine the probability that a new dist is less than 20, given
#' # the Poisson model.
#' add_probs(cars, fit, q = 20)
#'
#' # Determine the probability that a new dist is greater than 20,
#' # given the Poisson model.
#' add_probs(cars, fit, q = 30, comparison = ">")
#'
#' # Determine the probability that a new dist is greater than or
#' # equal to 20, given the Poisson model.
#' add_probs(cars, fit, q = 30, comparison = ">=")
#'
#' # Fit a logistic model
#' fit2 <- glm(I(dist > 30) ~ speed, data = cars, family = "binomial")
#' add_probs(cars, fit2, q = 0, comparison = "=")
#' add_probs(cars, fit2, q = 1, comparison = "=")
#'
#' @export
add_probs.glm <- function(df, fit, q, name = NULL, yhatName = "pred",
                          comparison = "<", nSims = 2000, ...){

    if (is.null(name) & (comparison == "<"))
        name <- paste("prob_less_than", q, sep="")
    if (is.null(name) & (comparison == ">"))
        name <- paste("prob_greater_than", q, sep="")
    if (is.null(name) & (comparison == "<="))
        name <- paste("prob_less_than_or_equal_to", q, sep="")
    if (is.null(name) & (comparison == ">="))
        name <- paste("prob_greater_than_or_equal_to", q, sep="")
    if (is.null(name) & (comparison == "="))
        name <- paste("prob_equal_to", q, sep="")

    if ((name %in% colnames(df)))
        warning ("These probabilities may have already been appended to your dataframe. Overwriting.")
    if (fit$family$family %in% c("poisson", "quasipoisson"))
        warning("The response is not continuous, so estimated probabilities are only approximate")

    if (fit$family$family == "binomial"){
        if(max(fit$prior.weights) == 1){ #distinguish between Bernoulli and binomial regression
            warning("Equivalent to Pr(Y = 0) (or Pr(Y = 1) if comparison = '>' is specified)")
            probs_logistic(df, fit, q, name, yhatName, comparison)

        } else {
            warning("Treating weights as indicating the number of trials for a
                  binomial regression where the response is the proportion of successes")
            sim_probs_other(df, fit, q, name, yhatName, nSims, comparison)
        }
    } else if (fit$family$family == "gaussian"){
        probs_gaussian(df, fit, q, name, yhatName, comparison)
    } else if (fit$family$family %in% c("poisson", "qausipoisson", "Gamma")){
        sim_probs_other(df, fit, q, name, yhatName, nSims, comparison)
    } else {
        stop("Unsupported family")
    }
}

probs_gaussian <- function(df, fit, q, name, yhatName, comparison){
    sigma_sq <- summary(fit)$dispersion
    inverselink <- fit$family$linkinv
    out <- predict(fit, newdata = df, se.fit = TRUE)
    se_terms <- out$se.fit
    se_global <- sqrt(sigma_sq + se_terms^2)
    t_quantile <- (q - inverselink(out$fit)) / se_global

    if (comparison %in% c("<", "<="))
        t_prob <- pt(q = t_quantile, df = fit$df.residual)
    if (comparison %in% c(">", ">="))
        t_prob <- 1 - pt(q = t_quantile, df = fit$df.residual)
    if (comparison == "=")
        stop("The response is assumed to be continuous -- the probability of this event is 0")
    if(is.null(df[[yhatName]]))
        df[[yhatName]] <- inverselink(out$fit)
    df[[name]] <- t_prob
    data.frame(df)
}

probs_logistic <- function(df, fit, q, name, yhatName, comparison){
    out <- predict(fit, newdata = df, type = "response")
    if (((comparison == "=") && (q == 0)) || ((comparison == "<") && (q < 1) && (q > 0)))
        probs <- 1 - out
    else
        probs <- out
    if(is.null(df[[yhatName]]))
        df[[yhatName]] <- out
    df[[name]] <- probs
    data.frame(df)
}

sim_probs_other <- function(df, fit, q, name, yhatName, nSims, comparison){

    out <- predict(fit, newdata = df, type = "response")
    sim_response <- get_sim_response(df = df, fit = fit, nSims = nSims)

    probs <- apply(sim_response, 1, FUN = calc_prob, quant = q, comparison = comparison)

    if(fit$family$family == "binomial"){
        out <- out * fit$prior.weights
        warning("For binomial models, add_probs's column of fitted values refelct E(Y|X) rather than typical default for logistic regression, pHat")
    }
    if(is.null(df[[yhatName]]))
        df[[yhatName]] <- out
    df[[name]] <- probs
    data.frame(df)

}
jthaman/ciTools documentation built on Nov. 11, 2023, 2:04 p.m.