R/snif.R

Defines functions snif snif.init summary.snif predict.snif print.snif

Documented in predict.snif print.snif snif summary.snif

#' Selection of nonlinear interactions by a forward stepwise algorithm
#'
#' \code{snif} is a greedy forward stepwise selection algorithm that takes
#' nonlinear effects and interactions into account when adding variables.
#' \code{snif} follows the strong hereditary principle (for X1:X2 to be
#' included as an interaction, both X1 and X2 both must be already included in
#' the model. \code{snif} supports both binary and continuous outcomes.
#'
#' \code{snif} uses a formula to initialize the model before the forward
#' stepwise stage of the algorithm. If you are not interested in using a
#' non-null initial model, \code{y ~ NULL} should do just fine. On the other
#' hand, if you do wish to use an initial model, some care must be taken so
#' \code{snif} will be happy.
#'
#' Basically, there are four valid ways to build snif formulas:
#' \enumerate{
#'     \item Add a linear main effect, as in \code{y ~ V2}
#'     \item Add a nonlinear main effect, as in \code{y ~ bs(V2)[,-1]}. Note
#'         that \code{[, -1]} is important.
#'   \item Add a linear and nonlinear main effect, as in \code{y ~ bs(V2)}.
#'       \code{snif} will automatically split a \code{bs} expansion without an
#'       index (\code{[ ,-1]}) into linear and nonlinear parts internally.
#'   \item Add an interaction term, as in \code{y + V2:V3} You may specify
#'       interactions between in any combination of 1-3. If you chose to include
#'       an interaction of type 3, note that \code{snif} will internally
#'       decompose the interaction into parts. For example \code{y ~ bs(V2):V3}
#'       is equivalent to \code{y ~ V2:V3 + bs(V2)[,-1]:V3}
#'}
#' \code{snif} formulas can be built out of any linear combination 1-4.
#' @param formula An initial formula for \code{snif} to start with.
#'   \code{formula} supports linear variables, basis spline expansions via
#'   \code{bs}, and interaction terms via \code{:}. It is highly recommended to
#'   read the details section of the documentation before trying anything fancy
#'   with \code{formula}
#' @param df A data.frame containing the data
#' @param type The type of regression to perform. Either "linear" (default) or
#'   "logistic"
#' @param method A character argument that specifies the kind of scoring method
#'   used to determine which variable to add to the model. supported options
#'   are "BIC" (default), "AIC", and "PV" (p.value).
#' @param degree Degree of basis spline expansion for nonlinear effects
#' @param maxnv Max number of variables to add. Default is \code{ncol(df)}
#' @param main.only Character vector of variables that are only considered for
#'   main effects
#' @param linear.only Character vector of variables that are considered to
#'   have only linear effects
#' @references
#'  Narisetty, Naveen N. and Mukherjee, Bhramar and Chen, Yin-Hsiu and Gonzalez,
#'  Richard and Meeker, John D. Selection of nonlinear interactions by a
#'  forward stepwise algorithm: Application to identifying environmental
#'  chemical mixtures affecting health outcomes. Statistics in Medicine.
#'  2019;38(9):1582-1600. \href{https://doi.org/10.1002/sim.8059}{10.1002/sim.8059}
#' @author Alexander Rix
#' @examples
#' library(snif)
#'
#' # snif contains a synthetic data.frame, snif.df
#' # if you not want to include any covariates in the initial model, you can
#' # just use NULL of the right hand side of formula
#' snif.out <- snif(formula = y ~ NULL, df = snif.df, type = "linear",
#'                      method = "BIC")
#'
#' # snif provides a summary method which will extract and summarise the best
#' # scoring subset
#' summary(snif.out)
#'
#' # you can specify an initial model by using the rules explained in the
#' # details section.
#' snif.out <- snif(formula = y ~ V2 + V2:V4, df = snif.df, type = "linear",
#'                      method = "BIC")
#'
#' # snif also supports binary outcomes. First, we copy snif.df and discretize y
#' snif.df.bin <- snif.df
#' snif.df.bin$y <- ifelse(snif.df.bin$y > 5, 1, 0)

#' snif.out <- snif(formula = y ~ NULL, df = snif.df.bin, type = "logistic",
#'                      method = "BIC")
#' summary(snif.out)
#' @importFrom splines bs
#' @importFrom rlang f_lhs f_rhs expr sym expr_text
#' @export
snif <- function(formula, df, type = "linear", method = "BIC", degree = 3,
                   maxnv = ncol(df), main.only = NULL, linear.only = NULL)
{
    # snif.init performs all the input checking and generates the initial
    # parameters for the forward selction stage.
    initial <- snif.init(formula, df, type, method, degree, maxnv, main.only,
                             linear.only)

    # score model is BIC, AIC, or PV
    score.model <- initial$score.model
    # model is glm or lm depending on type
    model       <- initial$model

    # 'sel' is the selected linear (lin), nonlinear (nl) and interaction (int)
    # terms as specified by 'formula'. 'can' is the candidate sets for lin, nl,
    # and int. 'can' takes into account 'main.only', 'linear only', and 'sel'
    sel <- initial$sel
    can <- initial$can

    add.inter <- function(int.can, selected, term)
    {
        int <- make_interaction(selected, term)

        if (all.vars(selected) == all.vars(term))
            int.can
        else if (any(c(all.vars(selected), all.vars(term)) %in% main.only))
            int.can
        else if (expr_text(int) %in% purrr::map_chr(sel$int, expr_text))
            int.can
        else
            c(int.can, int)
    }

    f <- formula
    score.candidate <- function(candidate)
    {
        f1 <- expr(!!f_lhs(f) ~ !!f_rhs(f) + !!candidate)
        score.model(x = model(f1, df), x0 = model(f, df))
    }

    output <- structure(
                  list(formula = f, score = score.model(model(f, df)), nv = 0,
                      df = df, model = model, method = method),
                  class = "snif"
    )

    # add the term that, when included in the current model, has the best score
    for (nv in 1:maxnv) {
        scores <- list(lin = c(purrr::map_dbl(can$lin, score.candidate), Inf),
                       nl  = c(purrr::map_dbl(can$nl, score.candidate), Inf),
                       int = c(purrr::map_dbl(can$int, score.candidate), Inf)
        )

        mins <- purrr::map_dbl(scores, min)

        # i is 1, 2, or 3 which correpsonds to lin, nl, and int
        i  <- which.min(mins)

        # j is the best scoring candidate
        j  <- which.min(scores[[i]])

        prev.sel <- c(sel$lin, sel$nl)
        term <- can[[i]][[j]]

        sel[[i]] <- c(term, sel[[i]])
        can[[i]] <- can[[i]][-j]

        if (i < 3)
            can$int <- purrr::reduce(prev.sel, add.inter, term = term,
                                         .init = can$int)

        if (is.null(f_rhs(f)))
            f <- expr(!!f_lhs(f) ~ !!term)
        else
            f <- expr(!!f_lhs(f) ~ !!f_rhs(f) + !!term)

        output$score   <- c(output$score, score.model(model(f, df)))
        output$nv      <- c(output$nv, nv)
        output$formula <- c(output$formula, f)
    }

    return(output)
}

snif.init <- function(f, df, type, method, degree, maxnv, main.only, linear.only)
{
    if (!rlang::is_formula(f))
        stop("'formula' must be a formula.")

    if (!is.data.frame(df))
        stop("'df' must be a data.frame.")

    method <- match.arg(method, c("BIC", "AIC", "PV"))
    type  <- match.arg(type, c("linear", "logistic"))

    if (!is.numeric(degree) || degree < 2)
        stop("'degree' must be an integer >= 2.")

    if (!is.numeric(maxnv) || maxnv < 1)
        stop("'maxnv' must be an integer > 0.")

    if (!is.null(main.only) && !is.character(main.only))
        stop("'main.only' must be a character vector.")

    if (!is.null(linear.only) && !is.character(linear.only))
        stop("'linear.only' must be a character vector.")

    if (length(setdiff(main.only, names(df))) > 0)
        stop("'main.only' contains variables that are not in 'df'.")

    if (length(setdiff(linear.only, names(df))) > 0)
        stop("'main.only' contains variables that are not in 'df'.")

    y <- deparse(f_lhs(f))

    if (type == "logistic" && length(unique(df[[y]])) != 2)
        stop(paste(y, "is not a binary variable."))

    if (type == "logistic")
        model <- function(f, df) stats::glm(f, df, family = stats::binomial)
    else
        model <- stats::lm

    if (method == "BIC")
        score.model <- function(x, x0) stats::BIC(x)
    else if (method == "AIC")
        score.model <- function(x, x0) stats::AIC(x)
    else if (type == "linear") {
        score.model <- function(x, x0)
        {
            if (missing(x0)) {
                fstat <- summary(x)$fstatistic
                if (is.null(fstat))
                    1
                else
                    eval(expr(stats::pf(!!!unname(fstat), lower.tail = F)))
            }
            else
                stats::anova(x0, x)$`Pr(>F)`[2]
        }
    }
    else {
        score.model <- function(x, x0)
        {
            if (missing(x0))
                stats::pchisq(x$null.deviance - x$deviance, x$df.null -
                                  x$df.residual, lower.tail = F)
            else
                stats::pchisq(x0$deviance - x$deviance, x0$df.residual -
                                  x$df.residual, lower.tail = F)
        }
    }
    parsed <- parse_formula(f, degree)
    sel    <- list(lin = parsed$lin, nl = parsed$nl, int = parsed$int)

    can <- NULL

    vars    <- setdiff(names(df), y)
    can$lin <- setdiff(purrr::map(vars, ~ sym(.x)), sel$lin)

    vars   <- setdiff(vars, linear.only)
    can$nl <- setdiff(purrr::map(vars, function(var)
        expr(bs(!!sym(var), degree = !!degree)[, -1])), sel$nl)

    add.inter <- function(int.can, selected, term)
    {
        int <- make_interaction(selected, term)

        if (all.vars(selected) == all.vars(term))
            int.can
        else if (any(c(all.vars(selected), all.vars(term)) %in% main.only))
            int.can
        else if (any(int == sel$int))
            int.can
        else
            c(int.can, int)
    }

    can$int <- purrr::map(c(sel$lin, sel$nl), function(term)
        purrr::reduce(c(sel$lin, sel$nl), add.inter, term = term,
                          .init = list()))

    can$int <- unique(rlang::squash(can$int))

    list(model = model, score.model = score.model, sel = sel, can = can)
}

#' Summarizing SNIF Fits
#'
#' \code{summary} method for class "snif"
#'
#' \code{summary.snif} extracts the best scoring model from \code{object} and
#' returns the linear model summary.
#' @param object An object of type "snif"
#' @param ... Additional arguments to \code{summary.lm} or \code{summary.glm},
#'     depending on the type of model fit
#' @export
summary.snif <- function(object, ...)
{
    if (class(object) != "snif")
        stop("'object' is not a 'snif' object.")

    i <- which.min(object$score)
    formula <- object$formula[[i]]

    list(
        formula = object$formula[[i]],
        score = object$score[i],
        summary = summary(object$model(formula, object$df), ...)
    )
}

#' Predict Using The Best SNIF Fit
#'
#' \code{predict} method for class "snif"
#'
#' \code{predict.snif} extracts the best scoring model from \code{object} and
#' returns the predictions from \code{predict.lm}.
#' @param object An object of type "snif"
#' @param newdata An optional data frame in which to look for variables with
#'   which to predict.  If omitted, the original values are used.
#' @param ... Additional arguments to \code{predict.lm} or \code{predict.glm},
#' depending on the type of model fit
#' @export
predict.snif <- function(object, newdata, ...)
{
    if (class(object) != "snif")
        stop("'object' is not a 'snif' object.")

    i <- which.min(object$score)
    formula <- object$formula[[i]]
    df <- object$df
    stats::predict(object$model(formula, df), newdata, ...)
}

#' Print \code{snif} objects
#'
#' \code{print} method for class "snip"
#' @param x An object of type "snif"
#' @param ... Additional parameters to pass onto \code{print}
#' @export
print.snif <- function(x, ...)
{
    if (class(x) != "snif")
        stop("'x' is not a snif object.")

    if (isTRUE(invisible(all.equal(x$model, stats::lm))))
        cat("snif run with type = 'linear'.\n")
    else
        cat("snif run with type = 'logisitic'.\n")

    for (i in 1:length(x$formula))
        cat(paste("Score:", signif(x$score[i]), "model:", expr_text(x$formula[[i]]), "\n"))
}
umich-cphds/snif documentation built on Nov. 5, 2019, 11:05 a.m.