Nothing
#' Goodness-of-fit test for high-dimensional generalized linear models
#'
#' The function can test goodness-of-fit of a low- or high-dimensional
#' generalized linear model (GLM) by detecting the presence of nonlinearity in
#' the conditional mean function of y given X. Outputs a p-value.
#'
#' @param X A matrix or a data frame with \code{n} rows. In case of
#' a data frame, each column may be a numerical vector or a factor.
#' @param y Response vector with \code{n} entries. (If \code{fam=="binomial"}, \code{y} may be a numerical vector of 0s and 1s or a factor with two levels).
#' @param fam Must be "gaussian", "binomial" or "poisson".
#' @param RP_function (optional) User specified function for residual prediction (see Details below).
#' @param penalize \code{TRUE} if penalization should be used when fitting the GLM models (see Details below).
#' @param nsplits Number of splits of the data set (see Details below).
#' @param output_all If \code{TRUE}, outputs all p-values from \code{nspilts} splits of the data.
#' @details
#' This function tests if the conditional mean of \code{y} given \code{X} could be
#' originating from a GLM family specified by the user via \code{fam}.
#'
#' The function works by splitting the data into parts A and B,
#' and computes a GLM fit on both parts.
#' If \code{penalize == TRUE}, these fits use \code{cv.glmnet} from package \code{glmnet},
#' otherwise they use \code{glmnet} with penalty set to 0. If
#' \code{RP_function} (optional) is not supplied by
#' the user, \code{randomForest} is used to predict remaining signal from the residuals from
#' GLM fit on part A. The test statistic is proportional to the dot product
#' between the random forest prediction and residuals from GLM fit on part B.
#' If \code{nsplits} is greater than one, the above procedure is repeated \code{nsplits}
#' times and the resulting p-values are aggregated using the approach from Meinshausen at al. (2012)
#'
#' A user may supply their own residual prediction function to replace random forest via
#' parameter \code{RP_function} (see Examples for use). The function must take as arguments
#' an input matrix \code{XA}, vector \code{resA} (with length \code{nrow(XA)}) and matrix \code{XB}. Its
#' role is to regress \code{resA} on input matrix \code{XA} with a preferred residual
#' prediction method and output a vector with dimensions \code{nrow(XB)}
#' that contains predictions of this fit on input \code{XB}.
#' @return If \code{output_all = FALSE}, the function outputs a single p-value.
#' Otherwise it returns a list containing the aggregated p-value in \code{pval} and
#' a vector of p-values from all splits in \code{pvals}.
#' @references
#' Janková, J., Shah, R. D., Bühlmann, P. and Samworth, R. (2019)
#' \emph{Goodness-of-fit testing in high-dimensional generalized linear models}
#' \url{https://arxiv.org/abs/1908.03606}
#' Meinshausen, N., Meier, L. and Bühlmann, P. (2012)
#' \emph{p-Values for High-Dimensional Regression}
#' Journal of the American Statistical Association, 104:488, 1671-1681
#' @examples
#' # Testing for nonlinearity: Logistic link function
#'
#' set.seed(1)
#' X <- matrix(rnorm(300*30), 300, 30)
#' z <- X[, 1] + X[, 2]^4
#' pr <- 1/(1 + exp(-z))
#' y <- rbinom(nrow(X), 1, pr)
#' (out <- GRPtest(X, y, fam = "binomial", nsplits = 5))
#'
#' # Testing for nonlinearity: Define your own RP function
#' # use package xyz
#'
#' my_RP_function <- function(XA, resA, XB){
#' xyz_fit <- xyz_regression(XA, resA)
#' predict(xyz_fit, newdata = as.matrix(XB))[,5]
#' }
#'
#' library(xyz)
#' set.seed(2)
#' X <- matrix(rnorm(500*30), 500, 30)
#' z <- X[,1:3]%*%rep(1,3) + 1*X[, 1]*X[,5]
#' mu <- exp(z)
#' y <- rpois(n = nrow(X), lambda = mu)
#' (out <- GRPtest(X, y, fam = "poisson", RP_function = my_RP_function))
#'
#'\dontrun{
#' # An example with factors (labelled as "Not run" due to running time > 10s)
#'
#' set.seed(1)
#' n <- 2021
#' X1 <- sample(c("A","B","C"), n, replace = TRUE)
#' X1 <- factor(X1, levels = c("A", "B", "C"))
#' X2 <- sample(c("Male","Female"), n, replace = TRUE)
#' X2 <- factor(X2, levels = c("Male", "Female"))
#' X3 <- rnorm(n)
#' X <- data.frame(X1, X2, X3)
#'
#' # Generate response y1 using a logistic regression model
#' prob1 <- 1 / (1 + exp( - (X1 == "B") + 2*(X1 == "C") - 2*(X2 == "Male") - X3) )
#' y1 <- rbinom(n, 1, prob1)
#'
#' # Output p-value for goodness of fit of the logistic regression model
#' (out <- GRPtest(X, y1, fam = "binomial", nsplits = 10))
#'
#' # Generate response y2 using a logistic regression model but with an interaction between X1 and X2
#' prob2 <- 1 / (1 + exp( - (X1 == "B") + 2*(X1 == "C") + 2*(X1 == "B")*(X2 == "Male") - X3) )
#' y2 <- rbinom(n, 1, prob2)
#'
#' # Test goodness of fit of the logistic regression model
#' (out <- GRPtest(X, y2, fam = "binomial", nsplits = 10))
#'}
#'
#' @export
#' @import stats
#' @import randomForest
#' @import glmnet
#' @import RPtests
#' @importFrom ranger ranger
#'
GRPtest <- function(X, y, fam = c("gaussian", "binomial", "poisson"),
RP_function = NULL,
nsplits = 5L, penalize = ifelse(p >= floor(n/1000), TRUE, FALSE), output_all = FALSE){
if (!is.data.frame(X) && !is.matrix(X)){
stop("X should be a matrix or a data frame.")
}
if(is.data.frame(X)){
Z <- model.matrix(~., data = X)
}else{
Z <- X
}
np <- dim(X)
if (is.null(np) | (np[2] < 1L)){
stop("X should be a matrix or a data frame with at least one column.")
}
n <- as.integer(np[1])
p <- as.integer(np[2])
if(fam == "binomial"){
if(is.factor(y)){
y <- as.numeric(y) - 1 # convert to numerical vector of 0s and 1s
}
}
if (length(y) != n){
stop("y must have nrow(X) components.")
}
if ((p >= n-1) && (penalize == FALSE)){
stop("When penalize=FALSE we must have ncol(X) < nrow(X)-1; try setting penalize=TRUE.")
}
if(fam != "gaussian" && fam != "binomial" && fam != "poisson"){
stop("family must be gaussian, binomial or poisson; try changing parameter fam.")
}
if (is.null(RP_function)) {
RP_function <- RP_randomForest
}
output <- nonlin_test(Z, y, fam, nsplits, RP_function, penalize)
pval_computed <- output$pval
if (output_all == FALSE){
return(pval_computed)
}else{
return(output)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.