##
## The main attributeble risk function
##
attribrisk <- function(formula, data, weights, subset, na.action,
varmethod = c('jackknife', 'bootstrap', 'none'),
conf= .95, baseline, k=20, control,
model=FALSE, x=FALSE, y=FALSE, ...) {
Call <- match.call() ## save a copy of the call
varmethod <- match.arg(varmethod)
# create a call to model.frame() that contains the formula (required)
# and any other of the relevant optional arguments
# then evaluate it in the proper frame
indx <- match(c("formula", "data", "weights", "subset", "na.action"),
names(Call), nomatch=0)
if (indx[1] ==0) stop("A formula argument is required")
temp <- Call[c(1,indx)] # only keep the arguments we wanted
temp[[1]] <- as.name('model.frame') # change the function called
special <- c("strata", "expos")
temp$formula <- if(missing(data)) stats::terms(formula, special)
else stats::terms(formula, special, data=data)
# Make "expos" visible for attribRisk formulas without making it
# visible globally
riskenv <- new.env(parent= environment(formula))
assign("expos", function(x) x, envir=riskenv)
environment(temp$formula) <- riskenv
mf <- eval(temp, parent.frame()) #evaluate the model.frame call
Terms <- terms(mf)
xlevels <- stats::.getXlevels(Terms, mf)
## Extract relevant variables
Y <- stats::model.response(mf)
if (is.null(Y)) stop("formula does not have a response variable")
if (is.factor(Y)) {
if (length(levels(Y)) > 2)
stop("response must have only 2 levels")
Y <- as.numeric(Y) -1
}
if (!all(Y==0 | Y==1))
stop ("Response must be a 0/1 variable or a class with two levels")
n <- length(Y)
if (n==0) stop("No observations in the data set")
weights <- stats::model.weights(mf)
if (length(weights)==0) weights <- rep(1.0, n)
offset <- stats::model.offset(mf)
if (length(offset)==0) offset <- rep(0.0, n)
## We want to pass any ... args to the control function, but not pass things
## like "dats=mydata" where someone just made a typo. The use of ...
## is simply to allow things like "xval=10" with easier typing
extraArgs <- list(...)
if (length(extraArgs)) {
controlargs <- names(formals(coxph.control)) #legal arg names
indx <- pmatch(names(extraArgs), controlargs, nomatch=0L)
if (any(indx==0L))
stop(paste("Argument %s not matched", names(extraArgs)[indx==0L]))
}
if (missing(control)) control <- attribrisk.control(...)
## Find the exposure variables
expos <- untangle.specials(Terms, 'expos', 1)
if (length(expos$var) >0) {
number.of.exposures <- length(expos$var)
exposure.names <- expos$var
}
else stop("There must be at least one exposure variable")
# Did they put exposure into an interaction? Not allowed
temp2 <- attr(Terms, 'factors')[temp$var,, drop=FALSE]
if (any(temp2==1 & rep(attr(Terms, "order"), each=nrow(temp2)) >1))
stop("exposure variables cannot be in an interaction")
match.id <- NULL #matched analysis flag
strats <- attr(Terms, "specials")$strata
if (length(strats)) {
temp <- untangle.specials(Terms, 'strata', 1)
dropx <- temp$terms
if (length(temp$vars)==1) strata.keep <- mf[[temp$vars]]
else stop("Only 1 strata term allowed")
match.id <- as.numeric(strata.keep)
# subscripting the Terms object messes up predvars and assign attributes
# but we don't use them
Terms2 <- Terms[-dropx]
attr(Terms2, 'intercept') <- 1 # intercept required
}
else Terms2 <- Terms
X <- stats::model.matrix(Terms2, mf)
# The AR estimate is associated with some change: all the smokers into
# non-smokers, or all cholesterol levels to < 210, or ...
# If a baseline data set was provided, it gives new covariate values for
# each subject. If not, exposure variables are converted to their
# baseline values: reference level for a factor, smallest value if they
# are a character, FALSE if logical, or 0 if numeric.
#
# Create data frame m2, which has all exposures replaced with reference
# values. The user can supply a baseline data set, or we use the
# above numbers. The key value for AR is
# exp((original X - modified X) %*% beta)
# see equation 20 of the technical report.
if (missing(baseline)) {
m2 <- mf
m2[expos$var] <- lapply(mf[expos$var], function(x) {
if (is.character(x)) x <- factor(x)
if (is.factor(x)) x <- factor(rep(levels(x)[1], n),
levels=levels(x))
else if (is.logical(x)) rep(FALSE, n)
else rep(0, n)
})
}
else {
## First, does the baseline data contain the exposure variable names?
## If we check before trying to use a formula on it we can
## create a nicer error message.
##
# The baseline data only needs exposure variables
tform <- formula(paste("~", paste(expos$vars, collapse='+')))
environment(tform) <- riskenv # make expos() visible to tform
indx <- is.na(match(all.vars(tform), names(baseline)))
if (any(indx)) {
stop(paste("Variable(s)", all.vars(tform)[indx],
"not found in the baseline data set"))
}
if (missing(subset))
mbase <- stats::model.frame(tform, data=baseline, xlev=xlevels,
na.action=function(x) x)
else mbase <- stats::model.frame(tform, data=baseline, xlev=xlevels,
na.action=function(x) x, subset=subset)
# Delete any rows from mbase that were deleted due to missing in
# the original data mf
if (nrow(mbase) !=1 && !is.null(attr(mf, 'na.action'))) {
droprows <- as.integer(attr(mf, 'na.action')) #rows to toss
if (any(droprows) > nrow(mbase))
stop("data and baseline have different observation counts")
mbase <- mbase[-droprows,]
}
if (nrow(mbase) != nrow(mf) && nrow(mbase) !=1)
stop("data and baseline have different observation counts")
if (any(is.na(mbase)))
stop("baseline data set cannot contain missing values")
m2 <- mf
for (i in 1:number.of.exposures) {
temp <- mf[[expos$vars[i]]]
if (class(temp) == 'factor' || class(temp) == 'character') {
indx <- match(mbase[[i]], temp)
if (any(is.na(indx)))
stop("A reference value is not in the orginal data")
else mbase[[i]] <- as.factor(temp)[indx] #propogate the levels
}
else if (class(temp) != class(mbase[[i]]) &&
!(is.numeric(temp) && is.numeric(mbase[[i]]))) #int/double
stop("a baseline variable's class does not match that in data")
m2[[expos$vars[i]]] <- mbase[[i]]
}
}
xbase <- X - model.matrix(Terms2, data=m2,
contrasts.arg =attr(X, 'contrasts'))
if (length(strats)) {
X <- X[,-1, drop=F] #remove the intercept colum, for conditional fits
xbase <- xbase[,-1, drop=F]
Y <- Surv(rep(1, nrow(X)), Y) #make it a survival object
}
##
## Do the fit
##
fit <- attribrisk.fit(X, Y, weights, offset, match.id,
xbase, fit=TRUE)
# Set up to do the variance
#
# The "k" parameter can be a single number, in which case the data is
# divided into that many groups, or it can be a vector of length n
# containing the group numbers. The latter for the rare person who
# wants to control which subjects in which group directly.
# If there are strata (match) then the grouping needs to keep parts of
# a match together. Cross validation and bootstrap pull sets of matches.
# There is one other odd special case: this is one of the few routines
# where data with integer case weights is used, i.e., a weight of "8"
# means 8 individuals like this. Detect this special case as all
# weights >=1, all weights integer, and at least one weight >1. In this
# case we do simple jackknife, ignoring "k".
casewt <- (all(weights>=1 & weights==floor(weights)) && max(weights) >1)
if (casewt) {
if (!missing(k) && k<n)
stop("caseweights + grouped cross-validation not allowed")
if (length(strats)) xgrp <- match.id
else xgrp <- 1:n
}
else {
if (varmethod=="bootstrap") k <- n # effectively ignore k
if (length(k) ==1) {
if (k==0) k <- n #convient shortcut
if (k <2)
stop("jackknife error cannot be based on less than 2 groups")
# If the data is stratified, don't assign subgroups that break
# a strata
if (length(strats)) {
nstrat <- max(match.id)
if (k >= nstrat) xgrp <- match.id
else {
temp <- sample(1:floor(k), nstrat, replace=TRUE)
xgrp <- temp[match.id]
}
}
else {
if (k >=n) xgrp <- 1:n
else xgrp <- sample(1:floor(k), n, replace=TRUE)
}
}
else if (length(k) >=n) {
# missing values nuisance
temp <- attr(mf, "na.action")
if (!is.na(temp)) k <- k[-temp] #remove missing
if (length(k) != n) stop("wrong length for 'k'")
xgrp <- as.numeric(as.factor(k)) #turn it into integer groups
if (length(strats)) {
# Ensure that the user didn't break any matched sets
temp <- tapply(xgrp, match.id, length)
if (any(temp>1))
stop("cross-validation sets are not allowed to split a match")
}
}
else stop("wrong length for k")
}
# Compute a variance
if (varmethod=="jackknife")
fit$var <- attribrisk_jack(fit$attribrisk, match.id, X, Y, weights,
offset, xbase, xgroup=xgrp, casewt)
else if (varmethod=="bootstrap") {
temp <- attribrisk_boot(match.id, X, Y, weights, offset, xbase,
casewt, xgroup=xgrp,
conf, nboot=control$nboot, bci=control$bootci)
fit$var <- temp$var
fit$boot <- temp$boot
fit$boot.ci <- temp$boot.ci
}
fit$conf <- conf
fit$terms <- Terms
if(model)
fit$model <- mf
if(x)
fit$x <- X
if(y)
fit$y <- Y
if(!is.null(attr(mf, "na.action")))
fit$na.action <- attr(mf, "na.action")
fit$call <- Call
class(fit) <- c("attribrisk", if (length(strats)) "coxph" else "glm")
return(fit)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.