R/DiscSurvEstimation.R

Defines functions estMargProb estMeasures estSurvCens estSurv estCumHaz gumbel estRegFrailty estForest estTree estRegSmooth estReg

Documented in estCumHaz estForest estMargProb estMeasures estReg estRegFrailty estRegSmooth estSurv estSurvCens estTree gumbel

#################################
# Estimation of regression models

#' Discrete Survival Basic Regression Model Fitting
#' 
#' Wrapper for estimation of discrete survival general linear model. Several preprocessing
#' options such as time-dependent covariates are available.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLong") and with 
#' time dependent covariates ("dataLongTimeDep"). The default is set to the former.
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumn Column name of the event indicator (class "character"). 
#' It is required that this is a binary variable with 1=="event" and 0=="censored".
#' @param idColumn Name of column of identification number of persons (class "character").
#' Default is set to use function \code{\link{dataLong}}, that does not need this argument.
#' @param timeAsFactor Should the time intervals be coded as factor (class "logical")? 
#' Default is FALSE. In the default settings the column is treated as quantitative variable (class "numeric").
#' @param family Specifies the assumption about the response distribution and the link function. 
#' Default value is the discrete survival continuation ratio model with logit-link 
#' (for comparison see \code{\link[stats]{family}}).
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param \dots Specification of additional arguments in function \code{\link[stats]{glm}}.
#' @return Returns an object of class c("glm", "lm").
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example if the two variables \emph{timeInt} and \emph{X1} should be included the formula would be
#' "~ timeInt + X1". The variable \emph{timeInt} is constructed before estimation of the model.
#' @note Model is rescaled to exclude intercepts.
#' @seealso \code{\link{dataLong}}, \code{\link{dataLongTimeDep}}, \code{\link[stats]{glm}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur [1:100, ]
#' 
#' # Estimate discrete survival continuation ratio model
#' estRegModel <- estReg(dataShort = SubUnempDur, dataTransform = "dataLong", 
#' formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumn = "censor1", timeColumn = "spell")
#' summary(estRegModel)
#' 
#' # Predictions
#' SubUnempDurLong <- dataLong(dataShort = SubUnempDur, 
#' eventColumn = "censor1", timeColumn = "spell", timeAsFactor=TRUE)
#' preds <- predict(estRegModel, newdata = SubUnempDurLong)
#' head(preds)
#' 
#' @export estReg
estReg <- function(dataShort, dataTransform = "dataLong", 
                   formulaVariable =~ timeInt, timeColumn, eventColumn,
                   idColumn=NULL, timeAsFactor=TRUE, family=stats::binomial, 
                   storeAugData=TRUE, ... ){
  
  # Data transformation
  if(dataTransform=="dataLong"){
    datLong <- dataLong(dataShort = dataShort, 
                        timeColumn=timeColumn, eventColumn=eventColumn,
                        timeAsFactor=timeAsFactor)
  }
  if(dataTransform == "dataLongTimeDep"){
    datLong <- dataLongTimeDep(dataSemiLong = dataShort, 
                               timeColumn=timeColumn, eventColumn=eventColumn,
                               idColumn=idColumn, timeAsFactor=timeAsFactor)
  }
  
  # Construct formula
  constructFormula <- formula(paste("y ~ 0 +", 
                                    paste(attr(terms(formulaVariable),"term.labels"), 
                                          collapse=" + "), sep = " "))
  
  # Fit glm
  RES <- glm(formula = constructFormula, family = family, data = datLong, ...)
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

#' Discrete Survival Additive Regression Model Fitting
#' 
#' Wrapper for estimation of discrete survival generalized additive models. Several preprocessing
#' options such as time-dependent covariates are available.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLong") and with 
#' time dependent covariates ("dataLongTimeDep"). The default is set to the former.
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumn Column name of the event indicator (class "character"). 
#' It is required that this is a binary variable with 1=="event" and 0=="censored".
#' @param idColumn Name of column of identification number of persons (class "character").
#' Default is set to use function \code{\link{dataLong}}, that does not need this argument.
#' @param timeAsFactor Should the time intervals be coded as factor (class "logical")? 
#' Default is FALSE. In the default settings the column is treated as quantitative variable (class "numeric").
#' @param family Specifies the assumption about the response distribution and the link function. 
#' Default value is the discrete survival continuation ratio model with logit-link 
#' (for comparison see \code{\link[stats]{family}}).
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param \dots Specification of additional arguments in function \code{\link[mgcv]{gam}}.
#' @return Returns an object of class "gam".
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example if the two variables \emph{timeInt} and \emph{X1} should be included as smooth functions
#' the formula would be "~ s(timeInt) + s(X1)". The variable \emph{timeInt} is constructed before estimation of the model.
#' @note Splines can also be used to include heterogeneity of mixed models with
#' penalized fixed effects. An advantage of this approach is that
#' one does not have to assume a specific distribution for the random effects.
#' 
#' Model is rescaled to exclude the intercept.
#' @seealso \code{\link{dataLong}}, \code{\link{dataLongTimeDep}}, 
#' \code{\link[mgcv]{gam}}, \code{\link{estRegFrailty}}
#' @references 
#' \insertRef{bergerTutorial}{discSurv} \cr\cr
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate discrete survival continuation ratio model
#' estRegSmoothModel <- estRegSmooth(dataShort = SubUnempDur, dataTransform = "dataLong", 
#' formulaVariable =~ s(timeInt) + s(age) + ui + logwage * ui, 
#' eventColumn = "censor1", timeColumn = "spell")
#' summary(estRegSmoothModel)
#' 
#' # Estimated smooth functions
#' plot(estRegSmoothModel)
#' 
#' @export estRegSmooth
estRegSmooth <- function(dataShort, dataTransform = "dataLong", 
                   formulaVariable =~ s(timeInt), timeColumn, eventColumn,
                   idColumn=NULL, timeAsFactor=FALSE,
                   family=stats::binomial, storeAugData=TRUE, ... ){
  
  # Data transformation
  if(dataTransform=="dataLong"){
    datLong <- dataLong(dataShort = dataShort, 
                        timeColumn=timeColumn, eventColumn=eventColumn,
                        timeAsFactor=timeAsFactor)
  }
  if(dataTransform == "dataLongTimeDep"){
    datLong <- dataLongTimeDep(dataSemiLong = dataShort, 
                               timeColumn=timeColumn, eventColumn=eventColumn,
                               idColumn=idColumn, timeAsFactor=timeAsFactor)
  }
  
  # Construct formula
  constructFormula <- formula(paste("y ~ 0 +", 
                                    paste(attr(terms(formulaVariable),"term.labels"), 
                                          collapse=" + "), sep = " "))
  
  # Fit glm
  RES <- mgcv::gam(formula = constructFormula, family = family, data = datLong,
                   ...)
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

#' Discrete Survival Tree Fitting
#' 
#' Wrapper for estimation of discrete survival classification and regression tree. 
#' Several preprocessing options such as time-dependent covariates are available.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLong") and with 
#' time dependent covariates ("dataLongTimeDep"). The default is set to the former.
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumn Column name of the event indicator (class "character"). 
#' It is required that this is a binary variable with 1=="event" and 0=="censored". 
#' @param idColumn Name of column of identification number of persons (class "character").
#' Default is set to use function \code{\link{dataLong}}, that does not need this argument.
#' @param timeAsFactor Should the time intervals be coded as factor (class "logical")? 
#' Default is FALSE. In the default settings the column is treated as quantitative variable (class "numeric").
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param responseAsFactor Should the response be converted to factor? Default is TRUE. 
#' Representation as factor is technically important for classification split rules such as
#' Gini index. For details see \code{\link[rpart]{rpart}}.
#' @param \dots Specification of additional arguments in function \code{\link[rpart]{rpart}}.
#' @return Returns an object of class "rpart".
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example, if the two variables \emph{timeInt} and \emph{X1} should be included,
#' the formula would be "~ timeInt + X1". The variable \emph{timeInt} is constructed before 
#' estimation of the model.
#' @seealso \code{\link{dataLong}}, \code{\link{dataLongTimeDep}}, \code{\link[rpart]{rpart}}
#' @references 
#' \insertRef{schmidSurvivalTreeEpi}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate discrete survival tree
#' estTreeModel <- estTree(dataShort = SubUnempDur, dataTransform = "dataLong", 
#' formulaVariable =~ timeInt + age + ui + logwage + ui, 
#' eventColumn = "censor1", timeColumn = "spell")
#'
#' # Graphical illustration of fitted tree
#' rpart.plot::rpart.plot(estTreeModel)
#' 
#' @export estTree
estTree <- function(dataShort, dataTransform = "dataLong", 
                    formulaVariable =~ timeInt, timeColumn, 
                    eventColumn, idColumn=NULL, 
                    timeAsFactor=FALSE, storeAugData=TRUE, 
                    responseAsFactor=TRUE, 
                    ... ){
  
  # Data transformation
  if(dataTransform=="dataLong"){
    datLong <- dataLong(dataShort = dataShort, 
                        timeColumn=timeColumn, eventColumn=eventColumn,
                        timeAsFactor=timeAsFactor)
  }
  if(dataTransform == "dataLongTimeDep"){
    datLong <- dataLongTimeDep(dataSemiLong = dataShort, 
                               timeColumn=timeColumn, eventColumn=eventColumn,
                               idColumn=idColumn, timeAsFactor = timeAsFactor)
  }
  
  if(responseAsFactor){
    datLong$y <- factor(datLong$y, levels=c(0, 1))
  }
  
  # Construct formula
  constructFormula <- formula(paste("y ~", 
                                    paste(attr(terms(formulaVariable),"term.labels"), 
                                          collapse=" + "), sep = " "))
  
  # Fit glm
  RES <- rpart::rpart(formula = constructFormula, data = datLong, method="class", ...)
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

#' Discrete Survival Random Forest Fitting
#' 
#' Wrapper for estimation of discrete survival random forest. 
#' Several preprocessing options such as time-dependent covariates are available.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLong") and with 
#' time dependent covariates ("dataLongTimeDep"). The default is set to the former.
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumn Column name of the event indicator (class "character"). 
#' It is required that this is a binary variable with 1=="event" and 0=="censored". 
#' @param idColumn Name of column of identification number of persons (class "character").
#' Default is set to use function \code{\link{dataLong}}, that does not need this argument.
#' @param timeAsFactor Should the time intervals be coded as factor (class "logical")? 
#' Default is FALSE. In the default settings the column is treated as quantitative variable (class "numeric").
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param responseAsFactor Should the response be converted to factor? Default is TRUE. 
#' Representation as factor is technically important for classification split rules such as
#' Gini index. For details see \code{\link[ranger]{ranger}}.
#' @param \dots Specification of additional arguments in function \code{\link[ranger]{ranger}}.
#' @return Returns an object of class "ranger".
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example, if the two variables \emph{timeInt} and \emph{X1} should be included,
#' the formula would be "~ timeInt + X1". The variable \emph{timeInt} is constructed before 
#' estimation of the model.
#' @seealso \code{\link{dataLong}}, \code{\link{dataLongTimeDep}}, \code{\link[ranger]{ranger}}
#' @references 
#' \insertRef{schmidHellinger}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate discrete survival random forest
#' estRFModel <- estForest(dataShort = SubUnempDur, dataTransform = "dataLong", 
#' formulaVariable =~ timeInt + age + ui + logwage + ui, 
#' eventColumn = "censor1", timeColumn = "spell")
#' estRFModel
#' 
#' @export estForest
estForest <- function(dataShort, dataTransform = "dataLong", 
                    formulaVariable =~ timeInt, timeColumn, 
                    eventColumn, idColumn=NULL,
                    timeAsFactor=FALSE, storeAugData=TRUE, 
                    responseAsFactor=TRUE, ... ){
  
  # Data transformation
  if(dataTransform=="dataLong"){
    datLong <- dataLong(dataShort = dataShort, 
                        timeColumn=timeColumn, eventColumn=eventColumn,
                        timeAsFactor = timeAsFactor)
  }
  if(dataTransform == "dataLongTimeDep"){
    datLong <- dataLongTimeDep(dataSemiLong = dataShort, 
                               timeColumn=timeColumn, eventColumn=eventColumn,
                               idColumn=idColumn, timeAsFactor = timeAsFactor)
  }
  
  # Convert response to factor
  if(responseAsFactor){
    datLong$y <- factor(datLong$y, levels=c(0, 1))
  }
  
  # Construct formula
  constructFormula <- formula(paste("y ~", 
                                    paste(attr(terms(formulaVariable),"term.labels"), 
                                          collapse=" + "), sep = " "))
  # Fit
  RES <- ranger::ranger(formula = constructFormula, data = datLong, 
                        probability=TRUE, ...)
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

#' Discrete Survival Frailty Regression Model Fitting
#' 
#' Wrapper for estimation of discrete survival frailty regression model. 
#' Smooth functions of covariates are supported. Several preprocessing options, 
#' such as time-dependent covariates, are available.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLong") and with 
#' time dependent covariates ("dataLongTimeDep"). The default is set to the former.
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumn Column name of the event indicator (class "character"). 
#' It is required that this is a binary variable with 1=="event" and 0=="censored". 
#' @param idColumn Name of column of identification number of persons (class "character").
#' Default is set to use function \code{\link{dataLong}}, that does not need this argument.
#' @param timeAsFactor Should the time intervals be coded as factor (class "logical")? 
#' Default is FALSE. In the default settings the column is treated as quantitative variable (class "numeric").
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param \dots Specification of additional arguments in function \code{\link[ranger]{ranger}}.
#' @return Returns an object of class "merMod".
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example, if the two variables \emph{timeInt} and \emph{X1} should be included,
#' the formula would be "~ timeInt + X1". The variable \emph{timeInt} is constructed before 
#' estimation of the model.
#' @note Model is rescaled to exclude the intercept.
#' @seealso \code{\link{dataLong}}, \code{\link{dataLongTimeDep}}, 
#' \code{\link[lme4]{glmer}}, \code{\link{estRegSmooth}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate discrete survival frailty regression model
#' estFrailtyModel <- estRegFrailty(dataShort = SubUnempDur, 
#' dataTransform = "dataLong", formulaVariable =~timeInt + (1 | obj), 
#' eventColumn = "censor1", timeColumn = "spell")
#' estFrailtyModel
#' 
#' @export estRegFrailty
estRegFrailty <- function(dataShort, dataTransform = "dataLong", 
                      formulaVariable =~ timeInt + (1|obj), timeColumn, 
                      eventColumn, idColumn=NULL,
                      timeAsFactor=FALSE, storeAugData=TRUE, ... ){
  
  # Data transformation
  if(dataTransform=="dataLong"){
    datLong <- dataLong(dataShort = dataShort, 
                        timeColumn=timeColumn, eventColumn=eventColumn,
                        timeAsFactor = timeAsFactor)
  }
  if(dataTransform == "dataLongTimeDep"){
    datLong <- dataLongTimeDep(dataSemiLong = dataShort, 
                               timeColumn=timeColumn, eventColumn=eventColumn,
                               idColumn=idColumn, timeAsFactor = timeAsFactor)
  }
  
  # Construct formula
  constructFormula <- formula(paste("y ~ 0 +", 
                                    paste(attr(terms(formulaVariable),"term.labels"), 
                                          collapse=" + "), sep = " "))
  
  # Fit
  RES <- lme4::glmer(formula = constructFormula, data = datLong, 
                     family=binomial, ...)
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

##########################
# gumbel

# Description:
# Specifies the link function of gumbel distribution function in the contex of general, linear models
# Can be used in function glm

# Output
# Should be used in context with glm in the base package

#' Gumbel Link Function
#' 
#' Constructs the link function with gumbel distribution in approriate format
#' for use in generalized, linear models.
#' 
#' Insert this function into a binary regression model
#' 
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' 
#' Matthias Schmid \email{matthias.schmid@@imbie.uni-bonn.de}
#' @seealso \code{\link{glm}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with copenhagen stroke study
#' library(pec)
#' data(cost)
#' head(cost)
#' 
#' # Take subsample and convert time to months
#' costSub <- cost [1:50, ]
#' costSub$time <- ceiling(costSub$time/30)
#' costLong <- dataLong(dataShort = costSub, timeColumn = "time", eventColumn = "status",
#' timeAsFactor=TRUE)
#' gumbelModel <- glm(formula = y ~ timeInt + diabetes, data = costLong, 
#' family = binomial(link = gumbel()))
#' 
#' # Estimate hazard given prevStroke and no prevStroke
#' hazPrevStroke <- predict(gumbelModel, newdata=data.frame(timeInt = factor(1:143), 
#' diabetes = factor(rep("yes", 143), levels = c("no", "yes"))), type = "response")
#' hazWoPrevStroke <- predict(gumbelModel, newdata = data.frame(timeInt = factor(1:143), 
#' diabetes=factor(rep("no", 143), levels = c("no", "yes"))), type = "response")
#' 
#' # Estimate survival function
#' SurvPrevStroke <- cumprod(1 - hazPrevStroke)
#' SurvWoPrevStroke <- cumprod(1 - hazWoPrevStroke)
#' 
#' # Example graphics of survival curves with and without diabetes
#' plot(x = 1:143, y = SurvWoPrevStroke, type = "l", xlab = "Months", 
#' ylab = "S (t|x)", las = 1, lwd = 2, ylim = c(0,1))
#' lines(x = 1:143, y = SurvPrevStroke, col = "red", lwd = 2)
#' legend("topright", legend = c("Without diabetes", "Diabetes"), 
#' lty = 1, lwd =2, col = c("black", "red"))
#' 
#' @export gumbel
gumbel <- function()
{
  linkfun <- function(mu) - log(- log(mu) )
  linkinv <- function(eta) exp(- exp(-eta))
  mu.eta <- function(eta) exp(- exp(-eta)) * exp(-eta)
  valideta <- function(eta) TRUE
  link <- paste("gumbel")
  structure(list(linkfun = linkfun, linkinv = linkinv,
                 mu.eta = mu.eta, valideta = valideta,
                 name = link), class = "link-glm")
}

#########################
# estCumHaz

#' Cumulative Hazard Function
#' 
#' Estimates the cumulative hazard function \eqn{\Gamma}(T = t|x) based on estimated hazard rates.
#' The hazard rates may or may not depend on covariates. The covariates have to
#' be equal across all estimated hazard rates. Therefore the given hazard rates
#' should only vary over time.
#' 
#' The argument \emph{hazards} must be given for all intervals \eqn{\left[ a_0, a_1 \right), \left[ a_1,
#' a_2 \right), ..., \left[ a_{q-1}, a_q \right), \left[ a_q, \infty \right)}.
#' 
#' @param hazards Estimated hazard rates (class "numeric")
#' @return Estimated probabilities of survival (class "numeric")
#' @note It is assumed that all time points up to the last 
#' theoretical interval \eqn{\left[ a_q, \infty \right)} are available. If not already present, 
#' these can be added manually.
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' subUnempDur <- UnempDur[1:100, ]
#' 
#' # Convert to long format
#' UnempLong <- dataLong(dataShort = subUnempDur, timeColumn = "spell", eventColumn = "censor1")
#' 
#' # Estimate binomial model with logit link
#' Fit <- glm(formula = y ~ timeInt + age + logwage, data=UnempLong, family = binomial())
#' 
#' # Estimate discrete survival function given age, logwage of first person
#' hazard <- predict(Fit, newdata = subset(UnempLong, obj == 1), type = "response")
#' cumHazCondX <- estCumHaz(c(hazard, 1))
#' cumHazCondX
#' 
#' @export estCumHaz
estCumHaz <- function(hazards) {
  # Input checks
  if(!is.vector(hazards)) {stop(
    "Argument *hazards* is not a vector! Please specify a numeric vector of estimated hazard rates.")}
  if(!all(-sqrt(.Machine$double.eps) <= hazards & 
          hazards - 1 <= sqrt(.Machine$double.eps))) {
    stop("Argument *hazards* is not a vector of probabilities! Please specify a numeric vector of estimated hazard rates.")}
  
  erg <- cumsum(hazards)
  names(erg) <- paste("CumHaz(T=", 1:length(hazards), ")", sep="")
  class(erg) <- "discSurvEstCumHaz"
  return(erg)
}

#########################
# estSurv

# Description
# Estimates the discrete survival function given discrete hazard rates
# The hazard rates may or not depend on covariates, but the covariates have to be equal for all estimates of the hazard rate!
# Therefore the hazard rates only depend on time. 

# Input
# haz: Estimated hazard rates, which only depend on a time interval (class "numeric). 
# It is assumed that the estimates contain the intervals [0, a_1), [a_1, a_2), ..., [a_{q-1}, a_{q}) and the last interval is computed

# Output
# Survival function over all intervals [0, a_1), [a_1, a_2), ..., [a_{q-1}, a_{q}), [a_q, a_{\infty}



#' Survival Function
#' 
#' Estimates the survival function S(T = t|x) based on estimated hazard rates.
#' The hazard rates may or may not depend on covariates. The covariates have to
#' be equal across all estimated hazard rates. Therefore the given hazard rates
#' should only vary over time.
#' 
#' The argument \emph{hazards} must be given for all intervals \eqn{\left[ a_0, a_1 \right), \left[ a_1,
#' a_2 \right), ..., \left[ a_{q-1}, a_q \right), \left[ a_q, \infty \right)}.
#' 
#' @param hazards Estimated hazard rates (class "numeric")
#' @return Estimated probabilities of survival (class "numeric")
#' @note It is assumed that all time points up to the last 
#' theoretical interval \eqn{\left[ a_q, \infty \right)} are available. If not already present, 
#' these can be added manually.
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @seealso \code{\link{estMargProb}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' 
#' # Example unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' subUnempDur <- UnempDur [1:100, ]
#' 
#' # Convert to long format
#' UnempLong <- dataLong(dataShort = subUnempDur, timeColumn = "spell", eventColumn = "censor1")
#' head(UnempLong)
#' 
#' # Estimate binomial model with logit link
#' Fit <- glm(formula = y ~ timeInt + age + logwage, data=UnempLong, family = binomial())
#' 
#' # Estimate discrete survival function given age, logwage of first person
#' hazard <- predict(Fit, newdata = subset(UnempLong, obj == 1), type = "response")
#' SurvivalFuncCondX <- estSurv(c(hazard, 1))
#' plot(SurvivalFuncCondX)
#' 
#' @export estSurv
estSurv <- function(hazards) {
  
  # Input checks
  if(!is.vector(hazards)) {stop(
    "Argument *hazards* is not a vector! Please specify a numeric vector of estimated hazard rates.")}
  if(!all(-sqrt(.Machine$double.eps) <= hazards & 
          hazards - 1 <= sqrt(.Machine$double.eps))) {
    stop("Argument *hazards* is not a vector of probabilities! Please specify a numeric vector of estimated hazard rates.")}
  
  erg <- cumprod(1 - hazards)
  names(erg) <- paste("S(T=", 1:length(hazards), ")", sep="")
  class(erg) <- "discSurvEstSurv"
  return(erg)
}

#########################
# estSurvCens

# Description
# Estimates the censoring survival function using life table estimator

#' Survival Function Of Censoring Process
#' 
#' Estimates the marginal survival function G(T=t) of the censoring process based on 
#' a life table estimator. Compatible with single event and competing risks data.
#' 
#' 
#' @param dataShort Data in original short format (class "data.frame").
#' @param timeColumn Name of column with discrete time intervals (class "character").
#' @param eventColumns Names of the event columns of \code{dataShort} (class "character"). In the
#' competing risks case the event columns have to be in dummy encoding format
#' (class "numeric").
#' @param censInterval Assumption about when censoring takes places within an interval
#' on the continuous time scale (class "character"). 
#' Possible values are "start", "middle", "end". Default is "middle".
#' @return Named vector of estimated survival function of the censoring process
#' for all time points except the last theoretical interval.
#' @note In the censoring survival function the last time interval \eqn{\left[ a_q, \infty \right)}
#' has the probability of zero.
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @seealso \code{\link{estSurv}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Load unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' subUnempDur <- UnempDur [1:100, ]
#' 
#' ######################
#' # Single event example
#' 
#' # Estimate censoring survival function G(t)
#' estG <- estSurvCens(dataShort = subUnempDur, timeColumn = "spell", 
#' eventColumns = "censor1")
#' estG
#' 
#' #########################
#' # Competing risks example
#' 
#' # Estimate censoring survival function G(t)
#' estG <- estSurvCens(dataShort = subUnempDur, timeColumn = "spell", 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"))
#' estG
#' 
#' 
#' @export estSurvCens
estSurvCens <- function(dataShort, timeColumn, eventColumns, 
                        censInterval="middle") {

  # # Expand short data to long competing risks format
  # dataLongFormat  <- dataLongCompRisks (dataShort = dataShort, timeColumn = timeColumn, eventColumns = eventColumns, 
  #                    timeAsFactor = timeAsFactor)
  # 
  # # Construct censoring data
  # dataShortLongCens <- dataCensoring (dataShortLong = dataLongFormat, respColumn = "y", 
  #                                   timeColumn = "timeInt")

  # Convert to censoring format
  dataShortCens <- dataCensoring (dataShort = dataShort, eventColumns = eventColumns,
                                     timeColumn = timeColumn)

  # Estimate nonparametric survival function of censoring variable
  tempLifeTab <- lifeTable(dataShort = dataShortCens, timeColumn = "timeCens", 
                            eventColumn = "yCens", censInterval=censInterval)
  preG <- tempLifeTab [[1]] [, "S"]
  GT <- c(1, preG)
  names(GT) <- paste("t=", 0:length(preG), sep="")
  return(GT)
}

#########################
# estMeasures

#' Fit Discrete Survival Measures Based On Hazards
#' 
#' Wrapper to estimate survival functions, cumulative hazards and 
#' marginal probabilities of all individuals of a given data set.
#' 
#' @param hazards Estimated hazards of the event (class "numeric").
#' @param obj Integer identification number of each individual. 
#' Usually this information is computed during data augmentation (class "numeric").
#' @return List of estimated measures (class "list"). There are three list elements with
#' following contents: \itemize{ 
#' \item {surv} List of estimated survival curves for each individual.
#' \item {cumHaz} List of estimated cumulative hazards for each individual.
#' \item {margProb} List of estimated marginal probabilities for each individual. 
#' }
#' @note It is assumed that the data set was preprocessed with functions such as
#' \code{\link{dataLong}} or \code{\link{dataLongTimeDep}}.
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @seealso \code{\link{estSurv}}, \code{\link{estCumHaz}}, 
#' \code{\link{estMargProb}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Load unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' subUnempDur <- UnempDur[1:100, ]
#' 
#' #########################
#' # Estimate hazard rates
#' 
#' estRegModel <- estReg(dataShort = subUnempDur, dataTransform = "dataLong", 
#' formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumn = "censor1", timeColumn = "spell")
#' estHaz <- predict(estRegModel, type="response")
#' 
#' ######################
#' # Single event example
#' 
#' # Estimate all survival functions of a given data set
#' measures1 <- estMeasures(hazards=estHaz, 
#' obj=attr(estRegModel, "augData")$obj)
#' 
#' # Survival function of first individual
#' measures1$surv[[1]] 
#' 
#' # Cumulative hazard of first individual
#' measures1$cumHaz[[1]]
#' 
#' # Marginal probabilities of first individual
#' measures1$margProb[[1]]
#' 
#' @export estMeasures
estMeasures <- function(hazards, obj) {
  # Prepare data
  splitData <- split(hazards, f=obj)
  RES <- list(surv=NA, cumHaz=NA, margProb=NA)
  
  # Estimate survival functions
  RES$surv <- lapply(splitData, function(x){
    estSurv(c(x, 1))
  })

  # Estimate cumulative hazards
  RES$cumHaz <- lapply(splitData, function(x){
    estCumHaz(c(x, 1))
  })
  
  # Estimate marginal probabilities
  RES$margProb <- lapply(splitData, function(x){
    estMargProb(c(x, 1))
  })
  
  return(RES)
}

#########################
# estMargProb

# Description
# Estimates the discrete marginal probability P(T = t) given discrete hazard rates
# The hazard rates may or not depend on covariates, but the covariates have to be equal for all estimates of the hazard rate!
# Therefore the hazard rates only depend on time. 

# Input
# haz: Estimated hazard rates, which only depend on a time interval (class "numeric"). 
# It is assumed that the estimates contain the intervals [0, a_1), [a_1, a_2), ..., [a_{q-1}, a_{q}) and the last interval is computed

# Output
# Survival function over all intervals [0, a_1), [a_1, a_2), ..., [a_{q-1}, a_{q}), [a_q, a_{\infty}



#' Marginal Probabilities
#' 
#' Estimates the marginal probability P(T=t|x) based on estimated discrete hazards.
#' The discrete hazards may or may not depend on covariates. The covariates have to
#' be equal across all estimated hazard rates. Therefore the given discrete hazards
#' should only vary over time.
#' 
#' The argument \emph{hazards} must be given for all intervals \eqn{\left[ a_0, a_1 \right), \left[ a_1,
#' a_2 \right), ..., \left[ a_{q-1}, a_q \right), \left[ a_q, \infty \right)}.
#' 
#' @param hazards Estimated discrete hazards (class "numeric")
#' @return Estimated marginal probabilities (class "numeric")
#' @note It is assumed that all time points up to the last interval \eqn{\left[ a_q, \infty \right)}
#' are available. If not already present, these can be added manually.
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @seealso \code{\link{estSurv}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' subUnempDur <- UnempDur [1:100, ]
#' 
#' # Convert to long format
#' UnempLong <- dataLong(dataShort = subUnempDur, timeColumn = "spell", eventColumn = "censor1")
#' head(UnempLong)
#' 
#' # Estimate binomial model with logit link
#' Fit <- glm(formula = y ~ timeInt + age + logwage, data = UnempLong, family = binomial())
#' 
#' # Estimate discrete survival function given age, logwage of first person
#' hazard <- predict(Fit, newdata = subset(UnempLong, obj == 1), type = "response")
#' 
#' # Estimate marginal probabilities given age, logwage of first person
#' MarginalProbCondX <- estMargProb (c(hazard, 1))
#' MarginalProbCondX
#' sum(MarginalProbCondX)==1 # TRUE: Marginal probabilities must sum to 1!
#' 
#' @export estMargProb
estMargProb <- function(hazards) {
  
  # Input checks
  if(!is.vector(hazards)) {stop("Argument *hazards* is not a vector! Please specify a numeric vector of estimated hazard rates.")}
  if(!all(-sqrt(.Machine$double.eps) <= hazards & 
          hazards - 1 <= sqrt(.Machine$double.eps))) {
    stop("Argument *hazards* is not between 0 and 1! Please specify a numeric vector of probabilities.")
  }
  if(length(hazards) == 0) {stop("Argument *hazards* is empty! Please specify a numeric vector of probabilities.")}
  
  if(length(hazards) > 1) {
    EstSurv <- estSurv(hazards)
    EstProb <- c(hazards[1], sapply(2:length(hazards), function (x) hazards [x] * EstSurv [x - 1]))
    names(EstProb) <- paste("P(T=", 1:length(hazards), ")", sep = "")
  }
  else {
    EstProb <- 1
    names(EstProb) <- paste("P(T=", 1, ")", sep = "")
  }
  class(EstProb) <- "discSurvEstMargProb"
  return(EstProb)
}

Try the discSurv package in your browser

Any scripts or data that you put into this service are public.

discSurv documentation built on April 29, 2026, 9:07 a.m.