R/DiscSurvEstimationCR.R

Defines functions estCompRisksGEE H_k hessianLogitLikGEE U_k_i gradientLogLikGEE covarGEE estCumInc estMargProbCompRisks estCumHazCompRisks estSurvCompRisks estRegSubDist estForestCompRisks estTreeCompRisks estRegSmoothCompRisks estRegCompRisks estMeasuresCompRisks

Documented in covarGEE estCompRisksGEE estCumHazCompRisks estCumInc estForestCompRisks estMargProbCompRisks estMeasuresCompRisks estRegCompRisks estRegSmoothCompRisks estRegSubDist estSurvCompRisks estTreeCompRisks

#########################
# estMeasuresCompRisks

#' Fit Discrete Survival Measures Based On Cause-Specific Hazards
#' 
#' Wrapper to estimate competing risks all cause survival functions, 
#' cumulative hazards and marginal probabilities of all individuals of a given data set.
#' 
#' @param hazards Estimated discrete hazards of the events (numeric class c("matrix", "array")). 
#' Discrete hazards of each time interval are stored in the rows and the number 
#' of columns equal to the number of events.
#' @param obj Integer identification number of each individual. 
#' Usually this information is computed during data augmentation (class "numeric").
#' @return List of estimated measures (class "list"). There are three list elements with
#' following contents: \itemize{ 
#' \item {surv_a} List of estimated all cause survival curves for each individual.
#' \item {cumHaz_a} List of estimated all cause cumulative hazards for each individual.
#' \item {margProb_a} List of estimated all cause marginal probabilities for each individual. 
#' }
#' @note It is assumed that the data set was preprocessed with functions such as
#' \code{\link{dataLong}} or \code{\link{dataLongTimeDep}}.
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @seealso \code{\link{estSurv}}, \code{\link{estCumHaz}}, 
#' \code{\link{estMargProb}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Load unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' subUnempDur <- UnempDur[1:100, ]
#' 
#' ######################################
#' # Estimate cause-specific hazard rates
#' 
#' estModel <- estRegSmoothCompRisks(dataShort = subUnempDur, dataTransform = "dataLongCompRisks", 
#' formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell")
#'
#' # Estimate cause-specific hazards (without censoring "e0")
#' estHaz <- predict(estModel, type="response")[, -1]
#' 
#' ######################
#' # Single event example
#' 
#' # Estimate all survival functions of a given data set
#' measures1CompRisks <- estMeasuresCompRisks(hazards=estHaz, 
#' obj=attr(estModel, "augData")$obj)
#' 
#' # All cause survival function of first individual
#' measures1CompRisks$surv_a[[1]] 
#' 
#' # All cause cumulative hazard of first individual
#' measures1CompRisks$cumHaz_a[[1]]
#' 
#' # All cause marginal probabilities of first individual
#' measures1CompRisks$margProb_a[[1]]
#' 
#' @export estMeasuresCompRisks
estMeasuresCompRisks <- function(hazards, obj) {
  # Estimate all cause hazards
  hazards_a <- rowSums(hazards)
  
  # Prepare data
  splitData <- split(hazards_a, f=obj)
  RES <- list(surv_a=NA, cumHaz_a=NA, margProb_a=NA)
  
  # Estimate survival functions
  RES$surv_a <- lapply(splitData, function(x){
    estSurv(c(x, 1))
  })
  
  # Estimate cumulative hazards
  RES$cumHaz_a <- lapply(splitData, function(x){
    estCumHaz(c(x, 1))
  })
  
  # Estimate marginal probabilities
  RES$margProb_a <- lapply(splitData, function(x){
    estMargProb(c(x, 1))
  })
  
  return(RES)
}

##################################################
# Discrete survival competing risks model fitting

#' Discrete Survival Model Fitting For Competing Risks
#' 
#' Wrapper for estimation of parametric discrete survival models for cause-specific competing risk models. 
#' Several preprocessing options such as time-dependent covariates are available and 
#' linear as well as smooth effects can be specified.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLongCompRisks") and with 
#' time dependent covariates ("dataLongCompRisksTimeDep"). The default is set to the former.
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumns Character vector giving the column names of the event
#' indicators without censoring column (class "character"). It is required that all events are
#' binary encoded. If the sum of all event indicators is zero, then this is
#' interpreted as a censored observation. Alternatively a column name of a
#' factor representing competing events can be given. In this case the argument
#' \emph{eventColumnsAsFactor} has to be set TRUE and the first level is assumed to
#' represent censoring.
#' @param eventColumnsAsFactor Should the argument eventColumns be intepreted
#' as column name of a factor variable (class "logical")? Default is FALSE.
#' @param timeAsFactor Specifies if the computed discrete time intervals should be 
#' converted to a categorical variable (class "logical"). Default is FALSE. 
#' In the default settings the discret time intervals are treated 
#' as quantitative (class "numeric").
#' @param idColumn Name of column of identification number of persons (class "character").
#' Default is set to use function \code{\link{dataLong}}, that does not need this argument.
#' @param family Specifies the assumption about the response distribution and the link function. 
#' Default value is the discrete survival cause-specific competing risks model with 
#' multinomial logistic function (function \code{\link[VGAM]{multinomial}}).
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param \dots Specification of additional arguments in function \code{\link[VGAM]{vgam}}.
#' @return Returns an object of class "vgam".
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example if the two variables \emph{timeInt} and \emph{X1} should be included the formula would be
#' "~ timeInt + X1". The variable \emph{timeInt} is constructed before estimation of the model.
#' @seealso \code{\link{dataLongCompRisks}}, \code{\link{dataLongCompRisksTimeDep}}, 
#' \code{\link[VGAM]{vgam}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv} \cr\cr
#' \insertRef{schmidCompRisks}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate cause-specific hazard rates
#' estModel <- estRegCompRisks(dataShort = SubUnempDur, dataTransform = "dataLongCompRisks", 
#' formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell")
#' estModel
#' 
#' @export estRegCompRisks
estRegCompRisks <- function(dataShort, dataTransform = "dataLongCompRisks",
                                  formulaVariable =~ timeInt, timeColumn, 
                                  eventColumns, eventColumnsAsFactor=FALSE,
                                  timeAsFactor=FALSE, idColumn=NULL, 
                                  family=VGAM::multinomial, storeAugData=TRUE, ... ){
  
  # Data transformation
  if(dataTransform=="dataLongCompRisks"){
    datLong <- dataLongCompRisks(dataShort = dataShort, timeColumn=timeColumn, 
                                 eventColumns = eventColumns, timeAsFactor=timeAsFactor,
                                 eventColumnsAsFactor = eventColumnsAsFactor)
  }
  if(dataTransform == "dataLongCompRisksTimeDep"){
    datLong <- dataLongCompRisksTimeDep(dataSemiLong = dataShort, timeColumn=timeColumn, 
                                        eventColumns = eventColumns, 
                                        eventColumnsAsFactor = eventColumnsAsFactor,
                                        timeAsFactor=timeAsFactor, idColumn=idColumn)
  }
  
  # Construct formula
  eventColumns <- grep("e[0-9]+", names(datLong), value = TRUE)
  constructFormula <- formula(paste("cbind(", paste(eventColumns, collapse = ","), ")", 
                                    "~ ", paste(attr(terms(formulaVariable),"term.labels"), collapse=" + "), sep = " "))
  
  # Fit
  RES <- VGAM::vglm(formula=constructFormula, family=family, data=datLong, ...)
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

##################################################
# Discrete survival competing risks model fitting

#' Discrete Survival Additive Model Fitting For Competing Risks
#' 
#' Wrapper for estimation of discrete survival generalized, additive models for cause-specific competing risk models. 
#' Several preprocessing options such as time-dependent covariates are available and 
#' linear as well as smooth effects can be specified.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLongCompRisks") and with 
#' time dependent covariates ("dataLongCompRisksTimeDep"). The default is set to the former.
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumns Character vector giving the column names of the event
#' indicators without censoring column (class "character"). It is required that all events are
#' binary encoded. If the sum of all event indicators is zero, then this is
#' interpreted as a censored observation. Alternatively a column name of a
#' factor representing competing events can be given. In this case the argument
#' \emph{eventColumnsAsFactor} has to be set TRUE and the first level is assumed to
#' represent censoring.
#' @param eventColumnsAsFactor Should the argument eventColumns be intepreted
#' as column name of a factor variable (class "logical")? Default is FALSE.
#' @param timeAsFactor Specifies if the computed discrete time intervals should be 
#' converted to a categorical variable (class "logical"). Default is FALSE. 
#' In the default settings the discret time intervals are treated 
#' as quantitative (class "numeric").
#' @param idColumn Name of column of identification number of persons (class "character").
#' Default is set to use function \code{\link{dataLong}}, that does not need this argument.
#' @param family Specifies the assumption about the response distribution and the link function. 
#' Default value is the discrete survival cause-specific competing risks model with 
#' multinomial logistic function (function \code{\link[VGAM]{multinomial}}).
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param \dots Specification of additional arguments in function \code{\link[VGAM]{vgam}}.
#' @return Returns an object of class "vgam".
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example if the two variables \emph{timeInt} and \emph{X1} should be included the formula would be
#' "~ timeInt + X1". The variable \emph{timeInt} is constructed before estimation of the model.
#' @seealso \code{\link{dataLongCompRisks}}, \code{\link{dataLongCompRisksTimeDep}}, 
#' \code{\link[VGAM]{vgam}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv} \cr\cr
#' \insertRef{schmidCompRisks}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate cause-specific hazard rates
#' estModel <- estRegSmoothCompRisks(dataShort = SubUnempDur, dataTransform = "dataLongCompRisks", 
#' formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell")
#' estModel
#' 
#' @export estRegSmoothCompRisks
estRegSmoothCompRisks <- function(dataShort, dataTransform = "dataLongCompRisks",
                            formulaVariable =~ timeInt, timeColumn, 
                            eventColumns, eventColumnsAsFactor=FALSE,
                            timeAsFactor=FALSE, idColumn=NULL, 
                            family=VGAM::multinomial, storeAugData=TRUE, ... ){
  
  # Data transformation
  if(dataTransform=="dataLongCompRisks"){
    datLong <- dataLongCompRisks(dataShort = dataShort, timeColumn=timeColumn, 
                                 eventColumns = eventColumns, timeAsFactor=timeAsFactor)
  }
  if(dataTransform == "dataLongCompRisksTimeDep"){
    datLong <- dataLongCompRisksTimeDep(dataSemiLong = dataShort, timeColumn=timeColumn, 
                                        eventColumns = eventColumns, 
                                        eventColumnsAsFactor = eventColumnsAsFactor,
                                        timeAsFactor=timeAsFactor, idColumn=idColumn)
  }
  
  # Construct formula
  eventColumns <- grep("e[0-9]+", names(datLong), value = TRUE)
  constructFormula <- formula(paste("cbind(", paste(eventColumns, collapse = ","), ")", 
                                    "~ ", paste(attr(terms(formulaVariable),"term.labels"), collapse=" + "), sep = " "))
  
  # Fit
  RES <- VGAM::vgam(formula=constructFormula, family=family, data=datLong, ...)
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

#' Discrete Survival Tree Fitting For Competing Risks
#' 
#' Wrapper for estimation of discrete survival tree for cause-specific competing risk models. 
#' Several preprocessing options such as time-dependent covariates are available.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLongCompRisks") and with 
#' time dependent covariates ("dataLongCompRisksTimeDep"). The default is set to the former.
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumns Character vector giving the column names of the event
#' indicators without censoring column (class "character"). It is required that all events are
#' binary encoded. If the sum of all event indicators is zero, then this is
#' interpreted as a censored observation. Alternatively a column name of a
#' factor representing competing events can be given. In this case the argument
#' \emph{eventColumnsAsFactor} has to be set TRUE and the first level is assumed to
#' represent censoring.
#' @param eventColumnsAsFactor Should the argument eventColumns be intepreted
#' as column name of a factor variable (class "logical")? Default is FALSE.
#' @param timeAsFactor Specifies if the computed discrete time intervals should be 
#' converted to a categorical variable (class "logical"). Default is FALSE. 
#' In the default settings the discret time intervals are treated 
#' as quantitative (class "numeric").
#' @param idColumn Name of column of identification number of persons (class "character").
#' Default is set to use function \code{\link{dataLong}}, that does not need this argument.
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param \dots Specification of additional arguments in function \code{\link[VGAM]{vgam}}.
#' @return Returns an object of class "rpart".
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example if the two variables \emph{timeInt} and \emph{X1} should be included the formula would be
#' "~ timeInt + X1". The variable \emph{timeInt} is constructed before estimation of the model.
#' @seealso \code{\link{dataLongCompRisks}}, \code{\link{dataLongCompRisksTimeDep}}, 
#' \code{\link[rpart]{rpart}}
#' @references 
#' \insertRef{bergerClassTree}{discSurv} \cr\cr
#' \insertRef{schmidCompRisks}{discSurv} 
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate GEE models for all events
#' estModel <- estRegSmoothCompRisks(dataShort = SubUnempDur, dataTransform = "dataLongCompRisks", 
#' formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell")
#' estModel
#' 
#' @export estTreeCompRisks
estTreeCompRisks <- function(dataShort, dataTransform = "dataLongCompRisks",
                                  formulaVariable =~ timeInt, timeColumn, 
                                  eventColumns, eventColumnsAsFactor=FALSE,
                                  timeAsFactor=FALSE, idColumn=NULL, 
                             storeAugData=TRUE, ... ){
  
  # Data transformation
  if(dataTransform=="dataLongCompRisks"){
    datLong <- dataLongCompRisks(dataShort = dataShort, timeColumn=timeColumn, 
                                 eventColumns = eventColumns, timeAsFactor=timeAsFactor,
                                 responseAsFactor=TRUE, 
                                 eventColumnsAsFactor=eventColumnsAsFactor)
  }
  if(dataTransform == "dataLongCompRisksTimeDep"){
    datLong <- dataLongCompRisksTimeDep(dataSemiLong = dataShort, timeColumn=timeColumn, 
                                        eventColumns = eventColumns, 
                                        eventColumnsAsFactor = eventColumnsAsFactor,
                                        timeAsFactor=timeAsFactor, idColumn=idColumn,
                                        responseAsFactor=TRUE)
  }
  
  # Construct formula
  constructFormula <- formula(paste("responses ~", paste(attr(terms(formulaVariable),"term.labels"), collapse=" + "), sep = " "))
  
  # Fit
  RES <- rpart::rpart(formula=constructFormula, data=datLong, method="class", ...)
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}


#' Discrete Survival Random Forest Fitting For Competing Risks 
#' 
#' Wrapper for estimation of discrete survival random forest for cause-specific competing risk models. 
#' Several preprocessing options such as time-dependent covariates are available.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLongCompRisks") and with 
#' time dependent covariates ("dataLongCompRisksTimeDep"). The default is set to the former.
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumns Character vector giving the column names of the event
#' indicators without censoring column (class "character"). It is required that all events are
#' binary encoded. If the sum of all event indicators is zero, then this is
#' interpreted as a censored observation. Alternatively a column name of a
#' factor representing competing events can be given. In this case the argument
#' \emph{eventColumnsAsFactor} has to be set TRUE and the first level is assumed to
#' represent censoring.
#' @param eventColumnsAsFactor Should the argument eventColumns be intepreted
#' as column name of a factor variable (class "logical")? Default is FALSE.
#' @param timeAsFactor Specifies if the computed discrete time intervals should be 
#' converted to a categorical variable (class "logical"). Default is FALSE. 
#' In the default settings the discret time intervals are treated 
#' as quantitative (class "numeric").
#' @param idColumn Name of column of identification number of persons (class "character").
#' Default is set to use function \code{\link{dataLong}}, that does not need this argument.
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param \dots Specification of additional arguments in function \code{\link[VGAM]{vgam}}.
#' @return Returns an object of class "ranger".
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example if the two variables \emph{timeInt} and \emph{X1} should be included the formula would be
#' "~ timeInt + X1". The variable \emph{timeInt} is constructed before estimation of the model.
#' @seealso \code{\link{dataLongCompRisks}}, \code{\link{dataLongCompRisksTimeDep}}, 
#' \code{\link[rpart]{rpart}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate GEE models for all events
#' estModel <- estRegSmoothCompRisks(dataShort = SubUnempDur, dataTransform = "dataLongCompRisks", 
#' formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell")
#' estModel
#' 
#' @export estForestCompRisks
estForestCompRisks <- function(dataShort, dataTransform = "dataLongCompRisks",
                             formulaVariable =~ timeInt, timeColumn, 
                             eventColumns, eventColumnsAsFactor=FALSE,
                             timeAsFactor=FALSE, idColumn=NULL, storeAugData=TRUE,
                             ... ){
  
  # Data transformation
  if(dataTransform=="dataLongCompRisks"){
    datLong <- dataLongCompRisks(dataShort = dataShort, timeColumn=timeColumn, 
                                 eventColumns = eventColumns, timeAsFactor=timeAsFactor,
                                 responseAsFactor=TRUE)
  }
  if(dataTransform == "dataLongCompRisksTimeDep"){
    datLong <- dataLongCompRisksTimeDep(dataSemiLong = dataShort, timeColumn=timeColumn, 
                                        eventColumns = eventColumns, 
                                        eventColumnsAsFactor = eventColumnsAsFactor,
                                        timeAsFactor=timeAsFactor, idColumn=idColumn,
                                        responseAsFactor=TRUE)
  }
  
  # Construct formula
  constructFormula <- formula(paste("responses ~", paste(attr(terms(formulaVariable),"term.labels"), collapse=" + "), sep = " "))
  
  # Fit
  RES <- ranger::ranger(formula=constructFormula, data=datLong, probability=TRUE, ...)
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

#' Discrete Survival Subdistribution Fitting
#' 
#' Wrapper for estimation of discrete survival regression subdistribution hazard models
#' that includes data preprocessing.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param timeColumn Character giving the column name of the observed times. 
#' It is required that the observed times are discrete (class "integer").
#' @param eventColumns Character vector giving the column names of the event
#' indicators without censoring column (class "character"). It is required that all events are
#' binary encoded. If the sum of all event indicators is zero, then this is
#' interpreted as a censored observation. Alternatively a column name of a
#' factor representing competing events can be given. In this case the argument
#' \emph{eventColumnsAsFactor} has to be set TRUE and the first level is assumed to
#' represent censoring.
#' @param eventColumnsAsFactor Should the argument eventColumns be intepreted
#' as column name of a factor variable (class "logical")? Default is FALSE.
#' @param eventFocus Column name of the event of interest or type 1 event (class "character").
#' @param timeAsFactor Specifies if the computed discrete time intervals should be 
#' converted to a categorical variable (class "logical"). Default is FALSE. 
#' In the default settings the discret time intervals are treated 
#' as quantitative (class "numeric").
#' @param family Specifies the assumption about the response distribution and the link function. 
#' Default value is the discrete survival continuation ratio model with logit-link 
#' (for comparison see \code{\link[stats]{family}}).
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param \dots Specification of additional arguments in function \code{\link[stats]{glm}}.
#' @return Returns an object of class c("glm", "lm"). The attribute "subDistWeights"
#' saves the subdistribution weights used in the estimation. 
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example if the two variables \emph{timeInt} and \emph{X1} should be included the formula would be
#' "~ timeInt + X1". The variable \emph{timeInt} is constructed before estimation of the model.
#' @seealso \code{\link{dataLongCompRisks}}, \code{\link{estCompRisksGEE}}, 
#' \code{\link{estRegSmoothCompRisks}}, \code{\link[stats]{glm}}
#' @references 
#' \insertRef{bergerSubdist}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate GEE models for all events
#' estModel <- estRegSubDist(dataShort = SubUnempDur,
#' formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell",
#' eventFocus="censor1")
#' summary(estModel)
#' 
#' @export estRegSubDist
estRegSubDist <- function(dataShort,
                          formulaVariable =~ timeInt, timeColumn, 
                          eventColumns, eventColumnsAsFactor=FALSE,
                          eventFocus, timeAsFactor=FALSE, family=stats::binomial,
                          storeAugData=TRUE, ... ){
  
  # Data transformation
  datLong <- dataLongSubDist(dataShort = dataShort, timeColumn=timeColumn, 
                             eventColumns = eventColumns, timeAsFactor=timeAsFactor,
                             eventFocus=eventFocus, eventColumnsAsFactor=eventColumnsAsFactor)

  # Construct formula
  constructFormula <- formula(paste("y ~ ", paste(
    attr(terms(formulaVariable),"term.labels"), collapse=" + "), sep = " "))
  
  # Fit
  RES <- glm(formula=constructFormula, data=datLong, weights=datLong$subDistWeights, 
             family=family, ...)
  attr(RES, which="subDistWeights") <- datLong$subDistWeights
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

#' Survival Function For Competing Risks
#' 
#' Computes the overall survival function S(T>t|x) for all causes based on 
#' estimated hazards of a competing risks model. The discrete hazards may or 
#' may not depend on covariates. The covariates have to be equal across all 
#' estimated hazards. Therefore the given discrete hazards should only vary 
#' over time.
#' 
#' The argument \emph{hazards} must be given for all intervals \eqn{\left[ a_0, a_1 \right), \left[ a_1,
#' a_2 \right), ..., \left[ a_{q-1}, a_q \right), \left[ a_q, \infty \right)}.
#' 
#' @param hazards Estimated discrete hazards (numeric class c("matrix", "array")). 
#' Discrete hazards of each time interval are stored in the rows and the number 
#' of columns equal to the number of events.
#' @return Estimated all cause survival probabilities (class "numeric")
#' @note It is assumed that all time points up to the last interval \eqn{\left[ a_q, \infty \right)}
#' are available. If not already present, these can be added manually.
#' @author Moritz Berger \email{moritz.berger@@zi-mannheim.de}
#' @seealso \code{\link{estSurv}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords competing_risks discrete_survival
#' @examples
#' 
#' # Example unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' subUnempDur <- UnempDur [1:100, ]
#' 
#' # Convert to long format
#' UnempLong <- dataLongCompRisks(dataShort = subUnempDur, timeColumn = "spell", 
#' eventColumns = c("censor1", "censor4"))
#' head(UnempLong)
#' 
#' # Estimate continuation ratio model with logit link
#' vglmFit <- VGAM::vglm(formula = cbind(e0, e1, e2) ~ timeInt + age + logwage, data = UnempLong,
#' family = VGAM::multinomial(refLevel = "e0"))
#' 
#' # Estimate discrete survival function given age, logwage of first person
#' hazards <- VGAM::predictvglm(vglmFit, newdata = subset(UnempLong, obj == 1), type = "response")[,-1]
#' SurvivalFuncCondX <- estSurvCompRisks(rbind(hazards, 0.5))
#' SurvivalFuncCondX
#' 
#' 
#' @export estSurvCompRisks
estSurvCompRisks <- function(hazards) {
  
  # Input checks
  if(!is.matrix(hazards)) {stop(
    "Argument *hazards* is not a matrix! Please specify a matrix of estimated hazards with the time intervals in the rows 
    and the events in the columns.")}
  if(!all(-sqrt(.Machine$double.eps) <= hazards & 
          hazards - 1 <= sqrt(.Machine$double.eps))) {
    stop("Argument *hazards* is not a matrix of probabilities! Please specify a matrix of estimated hazards with the 
          time intervals in the rows and the events in the columns.")}
  
  overall_haz <- apply(hazards, 1, sum)
  erg <- cumprod(1 - overall_haz)
  names(erg) <- paste("S_a(T=", 1:nrow(hazards), ")", sep = "")
  return(erg)
}

#########################
# estCumHazCompRisks

#' Cumulative Hazard Function For Competing Risks
#' 
#' Estimates the overall cumulative hazard function \eqn{\Gamma}(T = t|x) based 
#' on the overall hazard rate in competing risks. The hazard rates may or may 
#' not depend on covariates. The covariates have to be equal across all 
#' estimated hazard rates. Therefore the given hazard rates should only vary 
#' over time.
#' 
#' The argument \emph{hazards} must be given for all intervals \eqn{\left[ a_0, a_1 \right), \left[ a_1,
#' a_2 \right), ..., \left[ a_{q-1}, a_q \right), \left[ a_q, \infty \right)}.
#' 
#' @param hazards Estimated discrete hazards (numeric class c("matrix", "array")). 
#' Discrete hazards of each time interval are stored in the rows and the number 
#' of columns equal to the number of events.
#' @return Estimated cumulative all cause hazard (class "numeric")
#' @note It is assumed that all time points up to the last 
#' theoretical interval \eqn{\left[ a_q, \infty \right)} are available. If not already present, 
#' these can be added manually.
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords survival
#' @examples
#' 
#' 
#' # Example unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' subUnempDur <- UnempDur[1:100, ]
#' 
#' # Estimate binomial model with logit link
#' estModel <- estRegSmoothCompRisks(dataShort = subUnempDur, 
#' dataTransform = "dataLongCompRisks", 
#' formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell")
#' 
#' # Estimate discrete survival function given age, logwage of first person
#' predFrame <- attr(estModel, "augData")
#' hazard <- predict(estModel, newdata=predFrame[predFrame$obj==1, ], type = "response")
#' cumHazCondX <- estCumHazCompRisks(cbind(hazard, 1))
#' cumHazCondX
#' 
#' @export estCumHazCompRisks
estCumHazCompRisks <- function(hazards) {
  # Input checks
  if(!is.matrix(hazards)) {stop(
    "Argument *hazards* is not a matrix! Please specify a matrix of estimated hazards with the time intervals in the rows 
    and the events in the columns.")}
  if(!all(-sqrt(.Machine$double.eps) <= hazards & 
          hazards - 1 <= sqrt(.Machine$double.eps))) {
    stop("Argument *hazards* is not a matrix of probabilities! Please specify a matrix of estimated hazards with the 
          time intervals in the rows and the events in the columns.")}
  
  overall_haz <- apply(hazards, 1, sum)
  erg <- cumsum(overall_haz)
  names(erg) <- paste("CumHaz_a(T=", 1:nrow(hazards), ")", sep = "")
  return(erg)
}

#######################################################
#' Marginal Probabilities For Competing Risks
#' 
#' Estimates the marginal probability P(T = t, R = r|x) based on estimated 
#' discrete hazards of a competing risks model. The discrete hazards may or may 
#' not depend on covariates. The covariates have to be equal across all 
#' estimated discrete hazards. Therefore the given discrete hazards should only 
#' vary over time.
#' 
#' The argument \emph{hazards} must be given for all intervals \eqn{\left[ a_0, a_1 \right), \left[ a_1,
#' a_2 \right), ..., \left[ a_{q-1}, a_q \right), \left[ a_q, \infty \right)}.
#' 
#' @param hazards Estimated discrete hazards ("numeric matrix"). 
#' Discrete hazards of each time interval are stored in the rows and the number of columns equal to the number of events.
#' @return Estimated marginal probabilities ("numeric matrix")
#' @note It is assumed that all time points up to the last interval \eqn{\left[ a_q, \infty \right)}
#' are available. If not already present, these can be added manually. 
#' In competing risk settings the marginal probabilities of the last theoretical interval 
#' depend on the assumptions on the discrete hazards in the last theoretical interval. 
#' However the estimation of all previous discrete intervals is not affected by those assumptions.
#' @author Moritz Berger \email{moritz.berger@@zi-mannheim.de}
#' @seealso \code{\link{estMargProb}}
#' @references 
#' \insertRef{tutzModelDisc}{discSurv}
#' @keywords competing risks discrete survival
#' @examples
#' 
#' # Example unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' subUnempDur <- UnempDur [1:100, ]
#' 
#' # Convert to long format
#' UnempLong <- dataLongCompRisks(dataShort = subUnempDur, timeColumn = "spell", 
#' eventColumns = c("censor1", "censor4"))
#' head(UnempLong)
#' 
#' # Estimate continuation ratio model with logit link
#' vglmFit <- VGAM::vglm(formula = cbind(e0, e1, e2) ~ timeInt + age + logwage, data = UnempLong,
#' family = VGAM::multinomial(refLevel = "e0"))
#' 
#' # Estimate discrete survival function given age, logwage of first person
#' hazards <- VGAM::predictvglm(vglmFit, newdata = subset(UnempLong, obj == 1), type = "response")[,-1]
#' 
#' # Estimate marginal probabilities given age, logwage of first person
#' # Example 1
#' # Assumption: Discrete hazards in last theoretical interval are equal for both event types
#' MarginalProbCondX <- estMargProbCompRisks(rbind(hazards, 0.5))
#' MarginalProbCondX
#' all.equal(sum(MarginalProbCondX), 1) # TRUE: Marginal probabilities must sum to 1!
#' 
#' # Example 2
#' # Assumption: Discrete hazards in last theoretical interval are event1=, event2=
#' MarginalProbCondX2 <- estMargProbCompRisks(rbind(hazards, c(0.75, 0.25)))
#' MarginalProbCondX2
#' all.equal(sum(MarginalProbCondX2), 1) # TRUE: Marginal probabilities must sum to 1!
#' 
#' # Compare marginal probabilities given X
#' all.equal(MarginalProbCondX[1:5, ], MarginalProbCondX2[1:5, ])
#' all.equal(MarginalProbCondX[6, ], MarginalProbCondX2[6, ])
#' 
#' @export estMargProbCompRisks
estMargProbCompRisks <- function(hazards) {
  
  # Input checks
  if(!is.matrix(hazards)) {stop(
    "Argument *hazards* is not a matrix! Please specify a matrix of estimated hazards with the time intervals in the rows 
    and the events in the columns.")}
  if(!all(-sqrt(.Machine$double.eps) <= hazards & 
          hazards - 1 <= sqrt(.Machine$double.eps))) {
    stop("Argument *hazards* is not a matrix of probabilities! Please specify a matrix of estimated hazards with the 
         time intervals in the rows and the events in the columns.")}
  
  EstSurv <- estSurvCompRisks(hazards)
  EstProb <- matrix(NA, nrow=nrow(hazards), ncol = ncol(hazards))
  for(j in 1:ncol(hazards)){
    EstProb[,j] <- hazards[,j]*c(1,EstSurv[1:(nrow(hazards)-1)])
  }
  rownames(EstProb) <- paste("P_a(T=", 1:nrow(hazards), ")", sep = "")
  colnames(EstProb) <- colnames(hazards)
  
  return(EstProb)
}

###############################################################################

#' Cumulative Incidence Function For Competing Risks
#' 
#' Estimates the cumulative incidence function of a discrete time competing risks model 
#' given covariates P(T <= t, event = k | x). 
#' 
#' @param hazards Estimated discrete hazard rates of all events (class "matrix"). 
#' Each column represents one event. The first column is assumed to contain the censoring case 
#' and the discrete hazards should only vary over time in each row. 
#' @param eventFocus Column that represent the discrete hazards of the primary event (class "integer").
#' @param obj Integer identification number of each individual. 
#' Usually this information is computed during data augmentation (class "numeric").
#' @return Returns cumulative incidence function of the primary event. If
#' argument \emph{obj} is not empty a list of vectors is returned.
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details The covariates set is required to be constant across rows.
#' If argument \emph{obj} is given, then for each unique individual with specific 
#' covariates a cumulative incidence function is computed.
#' @seealso \code{\link{estCompRisksGEE}}, \code{\link{dataLongCompRisks}}, 
#' \code{\link{dataLongCompRisksTimeDep}}, \code{\link[geepack]{geeglm}}
#' @references 
#' \insertRef{minjungDiscComp}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur [1:100, ]
#' 
#' # Estimate GEE models for all events
#' estGEE <- estCompRisksGEE(dataShort = SubUnempDur, dataTransform = "dataLongCompRisks", 
#' corstr = "independence", formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell")
#' 
#' # Estimate hazards of all events given the covariates of third person
#' SubUnempDurLong <- dataLongCompRisks(dataShort = SubUnempDur, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell") 
#' preds <- predict(estGEE, subset(SubUnempDurLong, obj == 3))
#' 
#' # Estimate cumulative incidence function
#' cumIncGEE <- estCumInc(preds, eventFocus = 2)
#' plot(cumIncGEE)
#' 
#' @export estCumInc
estCumInc <- function(hazards, eventFocus=1, obj=NULL){
  
  if( is.null(obj) ){
    
    if( is.null(dim(hazards)) ){
      hazards <- matrix(hazards, ncol=1)
      eventFocus <- 1
    } 
    
    # Estimate cumulative incidence function
    allCauseHazard <- rowSums(hazards)
    P_T_k <- hazards[, eventFocus] * 
      c(1, estSurv(c(allCauseHazard, 1))[1:(dim(hazards)[1] - 1)] )
    cumInc1 <- cumsum(P_T_k)
    names(cumInc1) <- paste("P(T<=", 1:length(cumInc1), ", event)", sep = "")
    class(cumInc1) <- "discSurvEstCumInc"
    return(cumInc1)
    
  } else{
    
    if( is.null(dim(hazards)) ){
      hazards <- matrix(hazards, ncol=1)
      eventFocus <- 1
    } 
    
    # Split hazards
    splitData <- split(hazards, f=obj)
   
    # Estimate survival functions
    RES <- lapply(splitData, function(x){
      x1 <- matrix(x, ncol=ncol(hazards))
      allCauseHazard <- rowSums(x1)
      P_T_k <- x1[, eventFocus] * 
        c(1, estSurv(c(allCauseHazard, 1))[1:(dim(x1)[1] - 1)] )
      cumInc1 <- cumsum(P_T_k)
      names(cumInc1) <- paste("P(T<=", 1:length(cumInc1), 
                              ", event)", sep = "")
      class(cumInc1) <- "discSurvEstCumInc"
      cumInc1
    })
    
    return(RES)
  }
}

#########################################################
# Common covariance of parameter estimates between events

#' GEE Covariance Of All Events For Competing Risks
#' 
#' @description Estimates covariance of estimated parameters of all competing events generalized estimation equation models using sandwich approach.
#' @param modelEst Discrete time competing risks GEE model prediction model (class "dCRGEE").
#' @param printProgess Should progress status be printed during estimation (class "logical")? Default is FALSE. 
#' @return Returns symmetric matrix of rows and columns dimension "number of competing risks" * "number of regression parameters" ("numeric matrix").
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @seealso \code{\link{estCompRisksGEE}}, \code{\link{dataLongCompRisks}}, \code{\link{dataLongCompRisksTimeDep}}, 
#' \code{\link[geepack]{geeglm}}
#' @references 
#' \insertRef{minjungDiscComp}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur [1:100, ]
#' 
#' # Estimate GEE models for all events
#' estGEE <- estCompRisksGEE(dataShort = SubUnempDur, dataTransform = "dataLongCompRisks", 
#' corstr = "independence", formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2"), timeColumn = "spell")
#' 
#' \dontrun{
#' # Estimate covariance matrix of estimated parameters and competing events
#' estCovar <- covarGEE(modelEst=estGEE)
#' estCovar
#' 
#' # Covariances of estimated parameters of one event equal the diagonal blocks
#' lengthParameters <- length(estGEE[[1]]$coefficients)
#' noCompEvents <- length(estGEE)
#' meanAbsError <- rep(NA, noCompEvents)
#' for( k in 1:noCompEvents ){
#'   
#'   relInd <- (1 + (k-1) * lengthParameters) : (k * lengthParameters)
#'   meanAbsError[k] <- mean(abs(estCovar[relInd, relInd] - estGEE[[k]]$robust.variance))
#'   
#' }
#' mean(meanAbsError) 
#' # -> Covariance estimates within each event are equal to diagonal blocks in 
#' # complete covariance matrix with very small differences due to numerical accuracy.
#' }
#' 
#' @export covarGEE
covarGEE <- function(modelEst, printProgess=FALSE){

  # 1. Matrix A
  nObs <- max(modelEst[[1]]$id)
  k_max <- length(modelEst)
  pMax <- length(modelEst[[1]]$coefficients)
  matA <- matrix(0, nrow=k_max*pMax, ncol=k_max*pMax)
  for( k in 1:k_max ){
    
    tempMat <- H_k(k=k, modelEst=modelEst)
    relInd <- ( 1 + (k-1) * pMax):(k * pMax)
    matA[relInd, relInd] <- solve(tempMat)
    
    if(printProgess){
      if( k %% round(sqrt(k_max)) == 0 ){
        cat("Matrix 1 of 2 progress:", round(k/k_max*100, 2), "%", "\n")
      }
    }
    
  }
  
  # 2. Matrix B
  matBsum <- matrix(0, nrow=k_max * pMax, ncol=k_max * pMax)
  for( i in 1:nObs ){
    
    uVec <- rep(NA, k_max * pMax)
    for( k in 1:k_max ){
      
      relInd <- ( 1 + (k-1) * pMax):(k * pMax)
      uVec[relInd] <- U_k_i(k=k, i=i, modelEst=modelEst)
      
    }
    
    matBsum <- matBsum + outer(uVec, uVec)
    
    if(printProgess){
      if( i %% round(sqrt(nObs)) == 0 ){
        cat("Matrix 2 of 2 progress:", round(i/nObs*100, 2), "%", "\n")
      }
    }
    
  }
  matB <- matBsum / nObs
  
  # 3. Output
  finalMat <- matA %*% matB %*% matA / nObs
  namesTemp <- c(sapply(1:k_max, function(x) paste(
    names(modelEst)[[x]], "_", names(modelEst[[x]]$coefficients), sep="")))
  dimnames(finalMat) <- list(namesTemp, namesTemp)
  return(finalMat)
  
}

################################################################################
# Internal GEE functions

#' @rdname estCompRisksGEE
#' @param k Competing cause k (class "integer")
#' @param i Cluster number i (class "integer"), e. g. patients or hospitals
#' @param l Discrete time interval l (class "integer")
#' @param modelEst Discrete time competing risks GEE model prediction model (class "dCRGEE").
# #' @author Thomas Welchowski
#' @keywords survival
#' @noRd
gradientLogLikGEE <- function(k, i, l, modelEst){
  
  currentModel <- modelEst[[k]]
  
  exp_eta <- exp(currentModel$linear.predictors)
  datLong <- attr(currentModel, "datLong")
  modelMat <- model.matrix(currentModel$terms, datLong)
  dimModelMat <- dim(modelMat)
  
  gradAllObs <- matrix(NA, nrow=dimModelMat[1], ncol=dimModelMat[2])
  for( j in 1:dim(modelMat)[2] ){
    
    varInterest <- modelMat[, j]
    
    gradAllObs[, j] <- currentModel$y * varInterest / ( 1 + exp_eta ) - 
      (1-currentModel$y) * varInterest * exp_eta / ( 1 + exp_eta )
    
    
  }
  
  dimnames(gradAllObs)[[2]] <- dimnames(modelMat)[[2]]
  gradAllObsSub <- gradAllObs[datLong$obj==i & datLong$timeInt==l, ]
  
  gradAllObsSub
}

#' @rdname estCompRisksGEE
#' @param k Competing event k (class "integer")
#' @param i Cluster number i (class "integer"), e. g. patients or hospitals
#' @param modelEst Discrete time competing risks GEE model prediction model (class "dCRGEE").
# #' @author Thomas Welchowski
#' @keywords survival
#' @noRd
U_k_i <- function(k, i, modelEst){
  
  currentModel <- modelEst[[k]]
  datLong <- attr(currentModel, "datLong")
  modelMat <- model.matrix(currentModel$terms, datLong)
  n_i <- max(datLong[currentModel$id==i, "timeInt"])
  rowSums(sapply(1:n_i, function(m) gradientLogLikGEE(k=k, i=i, l=m, modelEst=modelEst)))
  
}

#' @rdname estCompRisksGEE
#' @param k Competing event k (class "integer")
#' @param i Cluster number i (class "integer"), e. g. patients or hospitals
#' @param l Discrete time interval l (class "integer")
#' @param modelEst Discrete time competing risks GEE model prediction model (class "dCRGEE").
# #' @author Thomas Welchowski
#' @keywords survival
#' @noRd
hessianLogitLikGEE <- function(k, i, l, modelEst){
  
  currentModel <- modelEst[[k]]
  datLong <- attr(currentModel, "datLong")
  
  exp_eta <- exp(currentModel$linear.predictors)
  modelMat <- model.matrix(currentModel$terms, datLong)
  dimModelMat <- dim(modelMat)
  
  restrictObs <- currentModel$id==i & datLong$timeInt==l
  hessMat <- matrix(NA, nrow=dimModelMat[2], ncol=dimModelMat[2])
  for( j in 1:dim(modelMat)[2] ){
    for( m in 1:dim(modelMat)[2] ){
      
      varInterest1 <- modelMat[restrictObs, j]
      varInterest2 <- modelMat[restrictObs, m]
      
      hessMat[m, j] <- - varInterest1 * varInterest2 * exp_eta[restrictObs] / 
        ( 1 + exp_eta[restrictObs] )^2
      
    }
  }
  
  dimnames(hessMat)[[1]] <- dimnames(modelMat)[[2]]
  dimnames(hessMat)[[2]] <- dimnames(modelMat)[[2]]
  hessMat
  
}

#' @rdname estCompRisksGEE
#' @param k Competing event k (class "integer")
#' @param modelEst Discrete time competing risks GEE model prediction model (class "dCRGEE").
# #' @author Thomas Welchowski
#' @keywords survival
#' @noRd
H_k <- function(k, modelEst){
  
  currentModel <- modelEst[[k]]
  datLong <- attr(currentModel, "datLong")
  modelMat <- model.matrix(currentModel$terms, datLong)
  nObs <- max(currentModel$id)
  
  matListSumObs <- vector("list", nObs)
  for( u in 1:nObs ){
    
    n_i <- max(datLong[currentModel$id==u, "timeInt"])
    matListSum <- vector("list", n_i)
    for( v in 1:n_i ){
      
      if(v==1){
        
        matListSum[[v]] <- hessianLogitLikGEE(k=k, i=u, l=v, modelEst=modelEst)
        
      } else{
        
        matListSum[[v]] <- matListSum[[v-1]] + hessianLogitLikGEE(k=k, i=u, l=v, modelEst=modelEst)
        
      }
      
    }
    
    if(u==1){
      
      matListSumObs[[u]] <- matListSum[[n_i]]
      
    } else{
      
      matListSumObs[[u]] <- matListSumObs[[u-1]] + matListSum[[n_i]]
      
    }
    
    
    
  }
  
  - matListSumObs[[nObs]] / nObs
  
}

###############################################################################

#' @rdname estCompRisksGEE
#' @param object Discrete time competing risks GEE model prediction model (class "dCRGEE").
# #' @author Thomas Welchowski
#' @param newdata New data set to be used for prediction (class "data.frame").
#' @keywords survival
#' @method predict dCRGEE
#' @export
predict.dCRGEE <- function (object, newdata, ...) {
  
  predMat <- matrix(NA, nrow = dim(newdata)[1], ncol = length(object))
  for(k in 1:length(object)){
    
    # Linear predictor
    fittedVal1 <- c(model.matrix(object[[k]]$terms, newdata) %*% 
                       object[[k]]$coefficients)
    
    # Predictions
    predMat[, k] <- exp(fittedVal1) / ( 1 + exp(fittedVal1) )
    
  }
  
  # Add probability of censoring
  predMat <- cbind(1 - rowSums(predMat), predMat)
  dimnames(predMat)[[2]] <- c("e0", names(object))
  return(predMat)
}

########################################
# GEE model for discrete competing risks

#' GEE Model Fitting For Competing Risks
#' 
#' Estimates generalized estimation equation model for each competing event separately. 
#' Dependence within person IDs is accounted for by assuming a working covariance structure.
#' 
#' @param dataShort Original data set in short format with each row corresponding to one independent 
#' observation (class "data.frame").
#' @param dataTransform Specification of the data transformation function from short to long format (class "character"). 
#' There are two available options: Without time dependent covariates ("dataLongCompRisks") and with 
#' time dependent covariates ("dataLongCompRisksTimeDep"). The default is set to the former.
#' @param corstr Assumption of correlation structure (class "character"). The following are 
#' permitted: '"independence"', '"exchangeable"', '"ar1"', '"unstructured"' and '"userdefined".
#' @param formulaVariable Specifies the right hand side of the regression formula (class "formula").
#' The default is to use the discrete time variable, which corresponds to a covariate free hazard. 
#' It is recommended to always include the discrete time variable "timeInt".
#' @param storeAugData Should the augmented data set be saved (class "logical")? 
#' Defaults is TRUE. The data set is available as attribute "augData".
#' @param \dots Additional arguments to data transformation 
#' functions \code{\link{dataLongCompRisks}} or \code{\link{dataLongCompRisksTimeDep}}.
#' Preprocessing function argument \emph{responseAsFactor} has to be set to FALSE (Default).
#' @return Returns an object of class "geeglm".
#' @author Thomas Welchowski \email{t.welchowski@@psychologie.uzh.ch}
#' @details Variables in argument \emph{formulaVariable} need to be separated by "+ ". 
#' For example if the two variables \emph{timeInt} and \emph{X1} should be included the formula would be
#' "~ timeInt + X1". The variable \emph{timeInt} is constructed before estimation of the model.
#' @seealso \code{\link{covarGEE}}, \code{\link{dataLongCompRisks}}, \code{\link{dataLongCompRisksTimeDep}}, 
#' \code{\link[geepack]{geeglm}}
#' @references 
#' \insertRef{minjungDiscComp}{discSurv}
#' @keywords survival
#' @examples
#' 
#' # Example with unemployment data
#' library(Ecdat)
#' data(UnempDur)
#' 
#' # Select subsample
#' SubUnempDur <- UnempDur [1:100, ]
#' 
#' # Estimate GEE models for all events
#' estGEE <- estCompRisksGEE(dataShort = SubUnempDur, dataTransform = "dataLongCompRisks", 
#' corstr = "independence", formulaVariable =~ timeInt + age + ui + logwage * ui, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell")
#' names(estGEE)
#' estGEE[[1]]
#' 
#' # Predictions
#' SubUnempDurLong <- dataLongCompRisks(dataShort = SubUnempDur, 
#' eventColumns = c("censor1", "censor2", "censor3", "censor4"), timeColumn = "spell")
#' preds <- predict(estGEE, newdata = SubUnempDurLong)
#' head(preds)
#' 
#' @export estCompRisksGEE
estCompRisksGEE <- function(dataShort, dataTransform = "dataLongCompRisks", 
                            corstr = "independence", 
                            formulaVariable =~ timeInt, storeAugData=TRUE, ... ){
  
  # Data transformation
  if(dataTransform=="dataLongCompRisks"){
    datLong <- dataLongCompRisks(dataShort = dataShort, responseAsFactor=FALSE, ... )
  }
  if(dataTransform == "dataLongCompRisksTimeDep"){
    datLong <- dataLongCompRisksTimeDep(dataSemiLong = dataShort, responseAsFactor = FALSE, ... )
  }
  
  # Construct formula
  eventColumns <- grep("e[0-9]+", names(datLong), value = TRUE)
  # Delete censoring column
  eventColumns <- eventColumns[-which(eventColumns == "e0")]
  eventColumns_length <- length(eventColumns)
  constructFormula <- sapply(1:eventColumns_length, 
                             function(x) formula(paste(paste(
                               eventColumns[x], collapse = ","), 
                               "~ ", paste(attr(terms(formulaVariable),
                                               "term.labels"), collapse=" + "), 
                               sep = " ")))
  
  # Fit GEE
  RES <- vector(mode = "list", length = eventColumns_length)
  names(RES) <- paste(eventColumns)
  for(k in 1:eventColumns_length){
    
    # Package geepack
    # RES[[k]] <- geepack::geeglm(formula = constructFormula[[k]], 
    #                             family = stats::binomial, 
    #                     data = datLong, corstr = corstr, id = datLong$obj)
    
    # Package gee
    RES[[k]] <- gee::gee(formula = constructFormula[[k]],
                                family = stats::binomial,
                        data = datLong, corstr = corstr, id = datLong$obj)
    attr(RES[[k]], "datLong") <- datLong
    
  }
  class(RES) <- "dCRGEE"
  if(storeAugData){
    attr(RES, "augData") <- datLong
  }
  return(RES)
}

Try the discSurv package in your browser

Any scripts or data that you put into this service are public.

discSurv documentation built on April 29, 2026, 9:07 a.m.