Nothing
#' Firth's Bias-Reduced Logistic Regression
#'
#' Implements Firth's bias-Reduced penalized-likelihood logistic regression.
#'
#' \code{logistf} is the main function of the package. It fits a logistic regression
#' model applying Firth's correction to the likelihood. The following generic methods are available for logistf's output
#' object: \code{print, summary, coef, vcov, confint, anova, extractAIC, add1, drop1,
#' profile, terms, nobs, predict}. Furthermore, forward and backward functions perform convenient variable selection. Note
#' that anova, extractAIC, add1, drop1, forward and backward are based on penalized likelihood
#' ratios.
#'
#' @param formula A formula object, with the response on the left of the operator,
#' and the model terms on the right. The response must be a vector with 0 and 1 or \code{FALSE} and
#' \code{TRUE} for the outcome, where the higher value (1 or \code{TRUE}) is modeled. It is
#' possible to include contrasts, interactions, nested effects, cubic or polynomial
#' splines and all S features as well, e.g. Y ~ X1*X2 + ns(X3, df=4).
#' @param data A data.frame where the variables named in the formula can be found,
#' i. e. the variables containing the binary response and the covariates.
#' @param pl Specifies if confidence intervals and tests should be based on the profile
#' penalized log likelihood (\code{pl=TRUE}, the default) or on the Wald method (\code{pl=FALSE}).
#' @param alpha The significance level (1-\eqn{\alpha} the confidence level, 0.05 as default).
#' @param control Controls iteration parameter. Default is \code{control= logistf.control()}
#' @param plcontrol Controls Newton-Raphson iteration for the estimation of the profile
#' likelihood confidence intervals. Default is \code{plcontrol= logistpl.control()}
#' @param modcontrol Controls additional parameter for fitting. Default is \code{logistf.mod.control()}
#' @param firth Use of Firth's penalized maximum likelihood (\code{firth=TRUE}, default) or the
#' standard maximum likelihood method (\code{firth=FALSE}) for the logistic regression.
#' Note that by specifying \code{pl=TRUE} and \code{firth=FALSE} (and probably a lower number
#' of iterations) one obtains profile likelihood confidence intervals for maximum likelihood
#' logistic regression parameters.
#' @param init Specifies the initial values of the coefficients for the fitting algorithm
#' @param weights specifies case weights. Each line of the input data set is multiplied
#' by the corresponding element of weights
#' @param na.action a function which indicates what should happen when the data contain NAs
#' @param offset a priori known component to be included in the linear predictor
#' @param plconf specifies the variables (as vector of their indices) for which profile likelihood
#' confidence intervals should be computed. Default is to compute for all variables.
#' @param flic If \code{TRUE}, intercept is altered such that the predicted probabilities become unbiased while
#' keeping all other coefficients constant (see Puhr et al, 2017)
#' @param model If TRUE the corresponding components of the fit are returned.
#' @param ... Further arguments to be passed to \code{logistf}
#'
#' @return The object returned is of the class \code{logistf} and has the following attributes:
#' \item{coefficients}{the coefficients of the parameter in the fitted model.}
#' \item{alpha}{the significance level (1- the confidence level) as specified in the input.}
#' \item{terms}{the column names of the design matrix}
#' \item{var}{the variance-covariance-matrix of the parameters.}
#' \item{df}{the number of degrees of freedom in the model.}
#' \item{loglik}{a vector of the (penalized) log-likelihood of the restricted and the full models.}
#' \item{iter}{A vector of the number of iterations needed in the fitting process for the null and full model.}
#' \item{n}{the number of observations.}
#' \item{y}{the response-vector, i. e. 1 for successes (events) and 0 for failures.}
#' \item{formula}{the formula object.}
#' \item{call}{the call object.}
#' \item{terms}{the model terms (column names of design matrix).}
#' \item{linear.predictors}{ a vector with the linear predictor of each observation.}
#' \item{predict}{a vector with the predicted probability of each observation.}
#' \item{hat.diag}{a vector with the diagonal elements of the Hat Matrix.}
#' \item{conv}{the convergence status at last iteration: a vector of length 3 with elements: last change in log likelihood, max(abs(score vector)), max change in beta at last iteration.}
#' \item{method}{depending on the fitting method 'Penalized ML' or `Standard ML'.}
#' \item{method.ci}{the method in calculating the confidence intervals, i.e. `profile likelihood' or `Wald', depending on the argument pl and plconf.}
#' \item{ci.lower}{the lower confidence limits of the parameter.}
#' \item{ci.upper}{the upper confidence limits of the parameter.}
#' \item{prob}{ the p-values of the specific parameters.}
#' \item{pl.iter}{only if pl==TRUE: the number of iterations needed for each confidence limit.}
#' \item{betahist}{only if pl==TRUE: the complete history of beta estimates for each confidence limit.}
#' \item{pl.conv}{only if pl==TRUE: the convergence status (deviation of log likelihood from target value, last maximum change in beta) for each confidence limit.}
#' \item{control}{a copy of the control parameters.}
#' \item{modcontrol}{a copy of the modcontrol parameters.}
#' \item{flic}{logical, is TRUE if intercept was altered such that the predicted probabilities become unbiased while
#' keeping all other coefficients constant. According to input of logistf.}
#' \item{model}{if requested (the default), the model frame used.}
#' \item{na.action}{information returned by model.frame on the special handling of NAs}
#'
#' @export
#'
#' @encoding UTF-8
#' @examples
#' data(sex2)
#' fit<-logistf(case ~ age+oc+vic+vicl+vis+dia, data=sex2)
#' summary(fit)
#' nobs(fit)
#' drop1(fit)
#' plot(profile(fit,variable="dia"))
#' extractAIC(fit)
#'
#' fit1<-update(fit, case ~ age+oc+vic+vicl+vis)
#' extractAIC(fit1)
#' anova(fit,fit1)
#'
#' data(sexagg)
#' fit2<-logistf(case ~ age+oc+vic+vicl+vis+dia, data=sexagg, weights=COUNT)
#' summary(fit2)
#'
#' # simulated SNP example
#' set.seed(72341)
#' snpdata<-rbind(
#' matrix(rbinom(2000,2,runif(2000)*0.3),100,20),
#' matrix(rbinom(2000,2,runif(2000)*0.5),100,20))
#' colnames(snpdata)<-paste("SNP",1:20,"_",sep="")
#' snpdata<-as.data.frame(snpdata)
#' snpdata$case<-c(rep(0,100),rep(1,100))
#'
#' fitsnp<-logistf(data=snpdata, formula=case~1, pl=FALSE)
#' add1(fitsnp, scope=paste("SNP",1:20,"_",sep=""), data=snpdata)
#' fitf<-forward(fitsnp, scope = paste("SNP",1:20,"_",sep=""), data=snpdata)
#' fitf
#'
#' @author Georg Heinze and Meinhard Ploner
#' @references Firth D (1993). Bias reduction of maximum likelihood estimates. Biometrika 80, 27-38.
#' Heinze G, Schemper M (2002). A solution to the problem of separation in logistic regression.
#' Statistics in Medicine 21: 2409-2419.
#'
#' Heinze G, Ploner M (2003). Fixing the nonconvergence bug in logistic regression with SPLUS and
#' SAS. Computer Methods and Programs in Biomedicine 71: 181-187.
#'
#' Heinze G, Ploner M (2004). Technical Report 2/2004: A SAS-macro, S-PLUS library and R
#' package to perform logistic regression without convergence problems. Section of Clinical Biometrics, Department of Medical Computer Sciences, Medical University of Vienna, Vienna, Austria.
#' http://www.meduniwien.ac.at/user/georg.heinze/techreps/tr2_2004.pdf
#'
#' Heinze G (2006). A comparative investigation of methods for logistic regression with separated or
#' nearly separated data. Statistics in Medicine 25: 4216-4226.
#'
#' Puhr R, Heinze G, Nold M, Lusa L, Geroldinger A (2017). Firth's logistic regression with rare events:
#' accurate effect estimates and predictions? Statistics in Medicine 36: 2302-2317.
#'
#' Venzon DJ, Moolgavkar AH (1988). A method for computing profile-likelihood based confidence
#' intervals. Applied Statistics 37:87-94.
#'
#'
#' @useDynLib logistf, .registration = TRUE
#'
#' @seealso [add1.logistf()], [anova.logistf()]
#' @rdname logistf
logistf <-
function(formula, data, pl = TRUE, alpha = 0.05, control, plcontrol, modcontrol, firth = TRUE, init, weights, na.action, offset, plconf=NULL,flic=FALSE, model = TRUE, ...){
call <- match.call()
if(missing(control)) control<-logistf.control()
if(pl==TRUE & missing(plcontrol)) plcontrol<-logistpl.control()
if(missing(modcontrol)) modcontrol<-logistf.mod.control()
mf <- match.call(expand.dots =FALSE)
m <- match(c("formula", "data","weights","na.action","offset"), names(mf), 0L)
mf <- mf[c(1, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
mt <- attr(mf, "terms")
y <- model.response(mf, type="any")
if(is.logical(y)){
y <- as.numeric(y)
}
else if(is.factor(y)){
if(length(levels(y))==2){
y <- as.numeric(y != levels(y)[1L])
}
}else if(!is.numeric(y)){
stop("Invalid response variable: must be logical or numeric or factor with 2 levels.")
}
n <- length(y)
x <- model.matrix(mt, mf)
k <- ncol(x)
cov.name <- labels(x)[[2]]
weight <- as.vector(model.weights(mf)) # avoid any problems with 1D or nx1 arrays by as.vector
offset <- as.vector(model.offset(mf))
if (is.null(offset)) offset<-rep(0,n)
if (is.null(weight)) weight<-rep(1,n)
if (missing(init)) init<-rep(0,k)
if (is.null(plconf)) { #if only intercept has to be fitted: calculate Wald CI for intercept
if(identical(cov.name,"(Intercept)")){
plconf <- NULL
rest_plconf <- 1
}
else{
plconf<-1:k
rest_plconf <- NULL
}
}
else { #if plconf is passed to logistf write NA to variables not specified
rest_plconf <- (1:k)[-plconf]
}
if (cov.name[1] == "(Intercept)") {
int <- 1
coltotest <- 2:k
}
else {
int <- 0
coltotest <-1:k
}
#Terms to be fitted:
if (!is.null(modcontrol$terms.fit)){
colfit <- modcontrol$terms.fit
rest_plconf <- (1:k)[-colfit] #compute confidence intervals for variables not specified in terms.fit with Wald
plconf <- intersect(plconf, colfit)
nterms <- length(colfit)
}
else {
colfit <- 1:k
nterms <- k
}
fit.full<-logistf.fit(x=x, y=y, weight=weight, offset=offset, firth, init, control=control, modcontrol = modcontrol)
modcontrolnull <-modcontrol
modcontrolnull$terms.fit <- 1
fit.null<-logistf.fit(x=x, y=y, weight=weight, offset=offset, firth, rep(0,k), control=control, modcontrol = modcontrolnull)
if(fit.full$iter>=control$maxit){
warning(paste("logistf.fit: Maximum number of iterations for full model exceeded. Try to increase the number of iterations or alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to parameter control"))
}
if(fit.null$iter>=control$maxit){
warning(paste("logistf.fit: Maximum number of iterations for null model exceeded. Try to increase the number of iterations or alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to parameter control"))
}
fit <- list(coefficients = fit.full$beta, alpha = alpha, terms=colnames(x), var = fit.full$var, df = nterms-int, loglik =c('full' = fit.full$loglik, 'null' = fit.null$loglik),
iter = c('full' = fit.full$iter, 'null' = fit.null$iter), n = sum(weight), y = y, formula = formula(formula), call = call, conv=fit.full$conv)
names(fit$conv)<-c("LL change","max abs score","beta change")
beta<-fit.full$beta
covs<-fit.full$var
pi<-fit.full$pi
fit$firth<-firth
fit$linear.predictors <- as.vector(x %*% beta + offset)
fit$predict <- fit.full$pi
fit$hat.diag <- fit.full$Hdiag
if(firth) fit$method <- "Penalized ML"
else fit$method <- "Standard ML"
if (!is.null(modcontrol$terms.fit)){ #consider for wald only covariance matrix with columns corresponding to variables in terms.fit
var.red <- fit$var[modcontrol$terms.fit,modcontrol$terms.fit]
vars <- diag(as.matrix(var.red))
waldprob <- wald_ci.lower <- wald_ci.upper <- vector(length = k)
waldprob[modcontrol$terms.fit] <- 1 - pchisq((beta[modcontrol$terms.fit]^2/vars), 1)
wald_ci.lower[modcontrol$terms.fit] <- as.vector(beta[modcontrol$terms.fit] + qnorm(alpha/2) * vars^0.5)
wald_ci.upper[modcontrol$terms.fit] <- as.vector(beta[modcontrol$terms.fit] + qnorm(1 - alpha/2) * vars^0.5)
}
else {
vars <- diag(covs)
waldprob <- 1 - pchisq((beta^2/vars), 1)
wald_ci.lower <- as.vector(beta + qnorm(alpha/2) * vars^0.5)
wald_ci.upper <- as.vector(beta + qnorm(1 - alpha/2) * vars^0.5)
}
fit$alpha<-alpha
fit$conflev<-1-alpha
if(pl){
#initialisation
betahist.lo<-vector(length(plconf),mode="list")
betahist.up<-vector(length(plconf),mode="list")
pl.conv<-matrix(0,length(plconf),4)
dimnames(pl.conv)[[1]]<-as.list(plconf)
dimnames(pl.conv)[[2]]<-as.list(c("lower, loglik","lower, beta", "upper, loglik", "upper, beta"))
LL.0 <- fit.full$loglik - qchisq(1 - alpha, 1)/2
pl.iter<-matrix(0,k,3)
icount<-0
iters <- vector() #number of iterations of fit.i per variable
for(i in plconf) {
icount<-icount+1
inter<-logistpl(x, y, beta, i, LL.0, firth, -1, offset, weight, plcontrol, modcontrol = modcontrol)
fit$ci.lower[i] <- inter$beta
pl.iter[i,1]<-inter$iter
betahist.lo[[icount]]<-inter$betahist
pl.conv.lower<-t(inter$conv)
inter<-logistpl(x, y, beta, i, LL.0, firth, 1, offset, weight, plcontrol, modcontrol = modcontrol)
fit$ci.upper[i] <- inter$beta
pl.iter[i,2]<-inter$iter
betahist.up[[icount]]<-inter$betahist
pl.conv.upper<-t(inter$conv)
pl.conv[icount,]<-cbind(pl.conv.lower,pl.conv.upper)
tofit <- setdiff(colfit,i)
if(length(tofit) == 0){
tofit <- 0
}
modcontrolpl <-modcontrol
modcontrolpl$terms.fit <- tofit
fit.i<-logistf.fit(x,y, weight=weight, offset=offset, firth, control=control, modcontrol = modcontrolpl)
pl.iter[i,3]<-fit.i$iter
fit$prob[i] <- 1-pchisq(2*(fit.full$loglik-fit.i$loglik),1)
fit$method.ci[i] <- "Profile Likelihood"
}
fit$pl.iter <- pl.iter
colnames(fit$pl.iter)<-c("Lower", "Upper", "Null model")
if(sum(fit$pl.iter>=plcontrol$maxit)){
notconv <- cov.name[apply(fit$pl.iter>=plcontrol$maxit, 1, sum)>0]
warning(paste("Nonconverged PL confidence limits: maximum number of iterations for variables:",paste0(notconv,collapse=", ")), " exceeded. Try to increase the number of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol")
}
fit$betahist<-list(lower=betahist.lo, upper=betahist.up)
fit$pl.conv<-pl.conv
if(sum(iters>=control$maxit)>0){ #check if algorithms for all models have converged
notconv <- cov.name[iters>=control$maxit]
warning(paste("Maximum number of iterations for PLR test for variables:",paste0(notconv,collapse=", ")), " exceeded. P-value may be incorrect. Try to increase the number of iterations by passing 'logistf.control(maxit=...)' to parameter control")
}
for (i in rest_plconf){
fit$ci.lower[i] <- 0
fit$ci.upper[i] <- 0
fit$prob[i] <- 0
fit$method.ci[i] <- "-"
}
}
else {
fit$prob <- waldprob
fit$method.ci <- rep("Wald",k)
fit$ci.lower <- wald_ci.lower
fit$ci.upper <- wald_ci.upper
}
names(fit$prob) <- names(fit$ci.upper) <- names(fit$ci.lower) <- names(fit$coefficients) <- dimnames(x)[[2]]
#flic:
if (flic){
fit$flic <- TRUE
#calculate linear predictors ommiting the intercept
lp_flic <- fit$linear.predictors-fit$coef[1]
#determine ML estimate of intercept
response <- formula.tools::lhs.vars(formula)
fit_flic <- glm(as.formula(paste(response, paste("1"), sep=" ~ ")), family=binomial(link=logit),
data=data, offset=lp_flic)
W <- Matrix::Diagonal(x = fit_flic$fitted.values*(1-fit_flic$fitted.values))
tmp.var <- solve(t(x) %*% W %*% x)
beta0.se <- sqrt(tmp.var[1,1])
fit$coefficients <- c(fit_flic$coef, fit$coef[-1])
fit$ci.lower <- c(fit_flic$coef-beta0.se*1.96, fit$ci.lower[-1])
fit$ci.upper <- c(fit_flic$coef+beta0.se*1.96, fit$ci.upper[-1])
fit$linear.predictors <- fit_flic$linear
fit$predict <-fit_flic$fitted
fit$var <- tmp.var
fit$method.ci[1] <- "Wald"
}
else fit$flic <- FALSE
fit$control <- control
fit$modcontrol <- modcontrol
if (model) fit$model <- mf
fit$na.action <- attr(mf, "na.action")
attr(fit, "class") <- c("logistf")
fit
}
is.logistf<-function(object){
if(attr(object,"class")=="logistf") return(TRUE)
else return(FALSE)
}
#' @method coef logistf
coef.logistf<-function(object,...){
return(object$coefficients)
}
#' @method confint logistf
#' @exportS3Method confint logistf
confint.logistf<-function(object,parm, level=0.95, exp=FALSE, ...){
# in fact, level is already determined in the object and will be overwritten
level<-object$conflev
levstr<-paste(level*100,"%",sep="")
cimat<-cbind(object$ci.lower,object$ci.upper)
rownames(cimat)<-names(object$ci.lower)
colnames(cimat)<-c(paste("Lower ",levstr,sep=""),paste("Upper ",levstr,sep=""))
if(exp) cimat<-exp(cimat)
return(cimat)
}
#' @method vcov logistf
#' @exportS3Method vcov logistf
vcov.logistf<-function(object,...){
var <- object$var
colnames(var) <- rownames(var) <- object$terms
return(var)
}
#' @method family logistf
#' @exportS3Method family logistf
family.logistf<-function(object, ...){
return(stats::binomial())
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.