#' List of paired fit/pred methods by type (glm/glmnet/gbm)
#'
#' @return a list with model types (e.g., glm/gbm), each with appropriate
#' \code{$fit} and \code{$pred} functions
#' @export
models <- function() {
list(
gbm = list(
fit = function(f, d, ...)
gbm::gbm(f, data = d, ...),
pred = function(m, d, f)
gbm::predict.gbm(m, d, gbm::gbm.perf(m, plot.it = FALSE),
type = "response")
),
glm = list(
fit = function(f, d, ...)
stats::glm(f, data = d, family = stats::quasibinomial, ...),
pred = function(m, d, f)
stats::predict.glm(m, newdata = d, type = "response")
),
lasso = list(
fit = function(f, d, ...)
undi::fit_glmnet(f, d, family = "binomial", alpha = 1, ...),
pred = function(m, d, f)
undi::predict4mm(m, d, f, type = "response", s = "lambda.min")
),
ridge = list(
fit = function(f, d, ...)
undi::fit_glmnet(f, d, family = "binomial", alpha = 0, ...),
pred = function(m, d, f)
undi::predict4mm(m, d, f, type = "response", s = "lambda.min")
),
sgd = list(
fit = function(f, d, model.control = list(family = "binomial"),
sgd.control = list(lr = 'adagrad', reltol = 1e-8, shuffle = T), ...)
undi::fit_sgd(f, d, model = "glm",
model.control = model.control,
sgd.control = sgd.control, ...),
pred = function(m, d, f)
undi::predict4mm(m, d, f, type = "response")
)
)
}
#' Given a policy and (optional) controls, generate a rad_control object
#'
#' @param pol a \code{\link{policy}} object
#' @param fit_fn string indicating the rad estimation model/procedure used.
#' \code{*_coef} methods use models without interaction between risk and
#' group, and return the coeficient on group membership. \code{*_avg} methods
#' will fit more flexible models (possibly with interactions), and compute
#' average ratios across the population. (TODO: better documentation is
#' expected)
#' @param controls character vector of additional controls to consider in the
#' second-stage model
#' @param use_speedglm whether or not to use \code{speedglm}, instead of
#' \code{stats::glm}, in cases where N > 2P (see details)
#'
#' @details speedglm can potentially speed-up computation significantly, but
#' only in cases where the number of rows is somewhat greater than the number
#' of features (specifically, when N > 2P). In terms of FLOPs at each Fisher
#' iteration, stats::glm requires (2np^2 - (2/3)p^3) FLOPS vs, (np^2 +
#' (4/3)p^3) for speedglm.
#'
#' @return a \code{rad_control} object constructed of \item{formula}{the formula
#' used in model fitting} \item{label}{a character label associated with the
#' model fit type} \item{grouping}{column name of group, as specified in
#' \code{pol$grouping}} \item{fit}{a function of the form f(d, w = NULL, ...)
#' for fitting a model with training data \code{d}} \item{pred}{function of
#' the form g(m, d) for generating predictions for data \code{d} with model
#' \code{m}} \item{method}{character string describing the method to use}
#' @export
rad_control <-
function(pol,
fit_fn = c("logit_coef", "gam_coef", "decbin_coef",
"logit_avg", "gam_avg"),
controls = NULL,
use_speedglm = TRUE) {
fit_fn <- match.arg(fit_fn)
switch (fit_fn,
logit_coef = {
feats <- c(pol$grouping, "risk__", controls)
f <- .make_formula(pol$treatment, feats)
label <- paste(feats, collapse = ", ")
ret <- list(
formula = f,
label = label,
fit = NA,
pred = function() stop("Never call pred() on a coef method for rad!"),
method = "coef"
)
# FLOPs at each iteration:
# stats::glm: 2np^2 - (2/3)p^3
# speedglm: np^2 + (4/3)p^3
# Use stats::glm only if n < 2p
if (use_speedglm && nrow(pol$data) > 2 * length(feats)) {
ret$fit <- function(d, w = NULL, ...) {
if (is.null(w)) {
speedglm::speedglm(f, d, family = stats::quasibinomial(), ...)
} else {
# Make sure that the weights exist in d at the time of call
d$w <- w
speedglm::speedglm(f, d, weights = w,
family = stats::quasibinomial(),
...)
}
}
} else {
ret$fit <- function(d, w = NULL, ...) {
if (is.null(w)) {
stats::glm(f, d, family = stats::quasibinomial(), ...)
} else {
# Make sure that the weights exist in d at the time of call
d$w <- w
stats::glm(f, d, weights = w, family = stats::quasibinomial(), ...)
}
}
}
},
gam_coef = {
feats <- c(pol$grouping, "s(risk__)", controls)
f <- .make_formula(pol$treatment, feats)
label <- paste(feats, collapse = ", ")
ret <- list(
formula = f,
label = label,
fit = function(d, w = NULL, ...) {
if (is.null(w)) {
mgcv::gam(f, data = d, family = stats::quasibinomial(), ...)
} else {
# Make sure that the weights exist in d at the time of call
d$w <- w
mgcv::gam(f, data = d, weights = w, family = stats::quasibinomial(), ...)
}
},
pred = function() stop("Never call pred() on a coef method for rad!"),
method = "coef"
)
},
decbin_coef = {
deciles <- pol$data %>%
filter(fold__ == "test") %>%
pull(risk__) %>%
stats::quantile(seq(.1, .9, .1))
feats <- c(pol$grouping, "riskbin__", controls)
f <- .make_formula(pol$treatment, feats)
label <- paste(feats, collapse = ", ")
ret <- list(
formula = f,
label = label,
fit = function(d, w = NULL, ...) {
d$riskbin__ <- cut(d$risk__, c(-Inf, deciles, Inf))
if (is.null(w)) {
mgcv::gam(f, data = d, family = stats::quasibinomial(), ...)
} else {
# Make sure that the weights exist in d at the time of call
d$w <- w
mgcv::gam(f, data = d, weights = w, family = stats::quasibinomial(), ...)
}
},
pred = function() stop("Never call pred() on a coef method for rad!"),
method = "coef"
)
if (use_speedglm && nrow(pol$data) > 2 * length(feats)) {
ret$fit <- function(d, w = NULL, ...) {
d$riskbin__ <- cut(d$risk__, c(-Inf, deciles, Inf))
if (is.null(w)) {
speedglm::speedglm(f, d, family = stats::quasibinomial(), ...)
} else {
# Make sure that the weights exist in d at the time of call
d$w <- w
speedglm::speedglm(f, d, weights = w,
family = stats::quasibinomial(),
...)
}
}
} else {
ret$fit <- function(d, w = NULL, ...) {
d$riskbin__ <- cut(d$risk__, c(-Inf, deciles, Inf))
if (is.null(w)) {
stats::glm(f, d, family = stats::quasibinomial(), ...)
} else {
# Make sure that the weights exist in d at the time of call
d$w <- w
stats::glm(f, d, weights = w, family = stats::quasibinomial(), ...)
}
}
}
},
logit_avg = {
feats <- c(pol$grouping, "risk__", controls)
f <- .make_formula(pol$treatment,
c(feats, paste0(pol$grouping, ":", "risk__")))
label <- paste(feats, collapse = ", ")
ret <- list(
formula = f,
label = label,
fit = function(d, w = NULL, ...) {
if (is.null(w)) {
stats::glm(f, d, family = stats::quasibinomial, ...)
} else {
# Make sure that the weights exist in d at the time of call
d$w <- w
stats::glm(f, d, weights = w, family = stats::quasibinomial, ...)
}
},
pred = function(m, d) {
stats::predict.glm(object = m,
newdata = d,
type = "response")
},
method = "avg"
)
},
gam_avg = {
feats <- c(pol$grouping, "gam::s(risk__)", controls)
f <- .make_formula(pol$treatment,
c(feats, paste0(pol$grouping, ":", "risk__")))
label <- paste(feats, collapse = ", ")
ret <- list(
formula = f,
label = label,
fit = function(d, w = NULL, ...) {
if (is.null(w)) {
gam::gam(f, data = d, family = stats::quasibinomial, ...)
} else {
# Make sure that the weights exist in d at the time of call
d$w <- w
gam::gam(f, data = d, weights = w, family = stats::quasibinomial, ...)
}
},
pred = function(m, d) {
gam::predict.Gam(m, d, type = "response")
},
method = "avg"
)
},
stop("Failed to construct disparate impact model functions\n",
"\t(Unknown fit_fn?)")
)
ret$grouping <- pol$grouping
class(ret) <- c("rad_control", class(ret))
return(ret)
}
#' Wrapper for fitting \code{\link[sgd]{sgd}} models with standardized design
#' matrix
#'
#' @param f formula
#' @param d data object
#' @param ... additional arguments for \code{\link[sgd]{sgd}}
#'
#' @return an \code{sgd} object
#' @export
fit_sgd <- function(f, d, ...) {
dm <- .get_mm(f, d)
Y <- dm$Y
x <- dm$x
x <- .safe_scale(x)
m <- sgd::sgd(x, Y, ...)
m$scale = attr(x, "scaled:scale")
m$center = attr(x, "scaled:center")
m$formula = f
m$coefficients[1] <- m$coefficients[1] -
sum(m$center[-1]*m$coefficients[-1]/m$scale[-1])
m$coefficients[-1] <-
m$coefficients[-1]/m$scale[-1]
m$coefficients <- stats::setNames(m$coefficients[,1], colnames(x))
return(m)
}
#' Wrapper for fitting \code{\link[glmnet]{glmnet}} models from a formula
#'
#' @param f formula
#' @param d data object
#' @param ... additional arguments for \code{\link[glmnet]{cv.glmnet}}
#'
#' @return an \code{\link[glmnet]{cv.glmnet}} object
#' @export
fit_glmnet <- function(f, d, ...) {
dm <- .get_mm(f, d)
Y <- dm$Y
x <- dm$x
m <- glmnet::cv.glmnet(x, Y, ...)
return(m)
}
#' Wrapper to compute predictions from a model matrix
#'
#' @param m model object
#' @param d data frame to generate predictions on
#' @param f model formula; used to transform \code{d} to a \code{design matrix}
#' @param ... other arguments to pass to the predict function
#'
#' @return generated predictions
#' @export
predict4mm <- function(m, d, f, ...) {
mm <- stats::model.matrix(f, d)
preds <- stats::predict(m, mm, ...)
if (is.null(dim(preds))) {
return(preds)
} else if (length(dim(preds)) == 1) {
dim(preds) <- NULL
return(preds)
} else if (length(dim(preds)) == 2 & dim(preds)[2] == 1) {
dim(preds) <- NULL
return(preds)
} else {
stop("Predictions have unexpected dimensions for model class: ", class(m))
}
}
#' Get model matrix and response given a formula and data frame
#'
#' @param f model formula
#' @param d data frame
#'
#' @return a list with \code{Y} and \code{x}, the response and design matrix,
#' respectively
.get_mm <- function(f, d) {
mf <- stats::model.frame(f, d)
# Extract Y values
Y <- stats::model.response(mf, "any")
if (length(dim(Y)) == 1L) {
nm <- rownames(Y)
dim(Y) <- NULL
if (!is.null(nm)) {
names(Y) <- nm
}
}
# Build and standardize model matrix x
mt <- attr(mf, "terms")
if (!stats::is.empty.model(mt)) {
x <- stats::model.matrix(mt, mf)
} else {
stop("No valid terms found in data for specified formula: ", format(f))
}
list(Y = Y, x = x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.