##
## intsurv: Integrative Survival Models
## Copyright (C) 2017-2021 Wenjie Wang <wang@wwenjie.org>
##
## This file is part of the R package intsurv.
##
## The R package intsurv is free software: You can redistribute it and/or
## modify it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or any later
## version (at your option). See the GNU General Public License at
## <https://www.gnu.org/licenses/> for details.
##
## The R package intsurv is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
##
## collation after class.R
##' @include class.R
NULL
##' Integrative Cox Model for Uncertain Event Times
##'
##' Fit an integrative Cox model proposed by Wang et al. (2020) for
##' right-censored survival data with uncertain event times due to imperfect
##' data integration.
##'
##' @param formula A formula object with the response being a \code{Survi}
##'     object, specifying the covariates and response variable in the model,
##'     such as \code{Survi(ID, time, event) ~ x1 + x2}.
##' @param data An optional data frame, list, or environment that contains the
##'     covariates and response variables included in the model. If not found
##'     in \code{data}, the variables are taken from
##'     \code{environment(formula)}, usually the environment from which this
##'     function is called.
##' @param subset An optional logical vector specifying a subset of observations
##' to be used in the fitting process.
##' @param na.action An optional function that indicates what the procedure
##'     should do if the data contain \code{NA}s. The default is set by the
##'     \code{na.action} setting of \code{\link[base]{options}}. The
##'     "factory-fresh" default is \code{\link[stats]{na.omit}}. Other possible
##'     values include \code{\link[stats]{na.fail}},
##'     \code{\link[stats]{na.exclude}}, and \code{\link[stats]{na.pass}}. See
##'     \code{help(na.fail)} for details.
##' @param contrasts An optional list, whose entries are values (numeric
##' matrices or character strings naming functions) to be used as
##' replacement values for the contrasts replacement function and whose
##' names are the names of columns of data containing factors. See
##' \code{contrasts.arg} of \code{\link[stats]{model.matrix.default}} for
##' details.
##' @param start A list returned by function \code{\link{iCoxph.start}}
##' specifying starting values of the parameters to be estimated in the
##' model. Please refer to the arguments of \code{\link{iCoxph.start}} for
##' the available parameters.
##' @param control A list returned by function \code{\link{iCoxph.control}}
##' specifying control parameters for the model estimation procedure.
##' Please refer to the arguments of \code{\link{iCoxph.control}} for the
##' available parameters.
##' @param ... Other arguments for future usage. A warning will be thrown if
##' any invalid argument is specified.
##'
##' @return
##' An \code{\link{iCoxph-class}} object, whose slots include
##' \itemize{
##' \item \code{call}: Function call.
##' \item \code{formula}: Formula used in the model fitting.
##' \item \code{data}: A processed data frame used for model fitting.
##' \item \code{nObs}: Number of observations.
##' \item \code{estimates}:
##' \itemize{
##' \item \code{beta}: Coefficient estimates.
##' \item \code{pi}: Estimated parameters in the prior multinomial
##'     distribution, indicating the probabilities of the corresponding
##'     records being true.
##' \item \code{baseline}: A data frame that contains estimated
##' baseline hazard function with columns named \code{time} and
##' \code{h0}.
##' }
##' \item \code{start}: The initial guesses \code{beta_mat} and
##'     \code{pi_mat} specified for the parameters to be estimated, and
##'     the set of initial guesses \code{beta_start} and \code{pi_start}
##'     that resulted in the largest objective function, i.e., the
##'     observed-data likelihood function.
##' \item \code{control}: The control list specified for model fitting.
##' \item \code{na.action}: The procedure specified to deal with
##'     missing values in the covariates.
##' \item \code{xlevels}: A list that records the levels in
##' each factor variable.
##' \item \code{contrasts}: Contrasts specified and used for each
##' factor variable.
##' \item \code{convergeCode}: \code{code} returned by function
##' \code{\link[stats]{nlm}}, which is an integer indicating why the
##' optimization process terminated. See \code{help(nlm)} for details.
##' \item \code{logL}: A numeric vector containing the observed-data
##' log-likelihood over iterations.
##' }
##'
##' @references
##'
##' Wang, W., Aseltine, R. H., Chen, K., & Yan, J. (2020). Integrative Survival
##' Analysis with Uncertain Event Times in Application to A Suicide Risk
##' Study. \emph{Annals of Applied Statistics}, 14(1), 51--73.
##'
##' @example inst/examples/iCoxph.R
##'
##' @seealso
##' \code{\link{iCoxph.start}} and \code{\link{iCoxph.control}}, respectively,
##' for starting and controlling iCoxph fitting;
##' \code{\link{summary,iCoxph-method}} for a summary of the fitted model;
##' \code{\link{coef,iCoxph-method}} for estimated covariate coefficients;
##' \code{\link{bootSe}} for SE estimates from bootstrap methods.
##'
##' @importFrom stats na.fail na.omit na.exclude na.pass .getXlevels
##' model.extract model.frame model.matrix nlm pnorm
##'
##' @export
iCoxph <- function(formula, data, subset, na.action, contrasts = NULL,
start = iCoxph.start(), control = iCoxph.control(), ...)
{
## record the function call to return
Call <- match.call()
## warning on `...`
warn_dots(...)
## arguments check
if (missing(formula))
stop("Argument 'formula' is required.")
if (missing(data))
data <- environment(formula)
dat0 <- with(data, eval(formula[[2L]]))
if (! is_Survi(dat0))
stop("The formula response must be a 'Survi' object.")
Call$formula <- formula
## Prepare data: ID, time, event ~ X(s)
mcall <- match.call(expand.dots = FALSE)
mmcall <- match(c("formula", "data", "subset", "na.action"),
names(mcall), 0L)
mcall <- mcall[c(1L, mmcall)]
## drop unused levels in factors
mcall$drop.unused.levels <- TRUE
mcall[[1L]] <- quote(stats::model.frame)
mf <- eval(mcall, parent.frame())
mt <- attr(mf, "terms")
mm <- stats::model.matrix(formula, mf, contrasts.arg = contrasts)
## number of covariates excluding intercept
if ((nBeta <- ncol(mm) - 1L) <= 0)
stop("Covariates must be specified in formula.")
## covariates' names
covar_names <- colnames(mm)[- 1L]
## data
dat <- as.data.frame(cbind(mf[, 1L], mm[, - 1L]))
colnames(dat) <- c("ID", "time", "event", covar_names)
nObs <- nrow(dat)
nBeta <- ncol(dat) - 3L
    ## prepare starting values
if (! is_iCoxph.start(start)) {
stop("The 'start' must be specified through ",
"function 'iCoxph.start()'")
}
startList <- c(start, list(nBeta_ = nBeta, dat_ = dat))
start <- unclass(do.call("iCoxph_start", startList))
## prepare 'control' parameters
if (! is_iCoxph.control(control)) {
stop("The 'control' must be specified through ",
"function 'iCoxph.control()'")
}
control <- unclass(control)
## sort by time and ID
incDat <- dat[(orderInc <- with(dat, order(time, ID))), ]
    ## indicator for subjects having multiple records
incDat$dupIdx <- with(incDat, ID %in% unique(ID[duplicated(ID)]))
## define some variables for ease of computing
incDat$eventIdx <- incDat$event == 1L
dupVec <- duplicated(incDat$time)
tied <- any(dupVec)
incDat$firstIdx <- ! dupVec
h0Dat <- h_cDat <- data.frame(time = incDat$time[incDat$firstIdx])
xMat <- as.matrix(incDat[, 4L : (3L + nBeta)])
piMat <- start$pi_mat[orderInc, , drop = FALSE]
## initialize log-likelihood
logL_max0 <- - Inf
n_beta_start <- ncol(start$beta_mat)
n_pi_start <- ncol(piMat)
logL_max_vec <- rep(NA, n_beta_start * n_pi_start)
beta_est_mat <- matrix(NA, nrow = n_beta_start * n_pi_start, ncol = nBeta)
for (oneBeta in seq_len(n_beta_start)) {
for (onePi in seq_len(n_pi_start)) {
            ## index of the current combination of starting values
            idx_start <- (oneBeta - 1L) * n_pi_start + onePi
incDat$piVec <- piVec <- piMat[, onePi]
## trace the log-likelihood for observed data
logL <- rep(NA, control$ecm_max_iter)
## trace beta estimates from each iteration of ECM
betaMat <- matrix(NA, nrow = control$ecm_max_iter + 1L,
ncol = nBeta)
betaMat[1L, ] <- start$beta_mat[, oneBeta]
tol_update <- sqrt(control$tol_beta)
for (iter in seq_len(control$ecm_max_iter)) {
oneFit <- ic_oneECMstep(betaHat = betaMat[iter, ],
h0Dat = h0Dat,
h_cDat = h_cDat,
dat = incDat,
xMat = xMat,
tied = tied,
control = control)
                ## log likelihood
logL[iter] <- oneFit$logL
## update pi estimates
incDat$piVec <- oneFit$piVec
## update beta estimates
betaEst <- oneFit$betaEst
betaMat[iter + 1L, ] <- betaEst$estimate
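                ## relative change measured as ||new - old|| / ||new + old||
                ## (the criterion documented for 'tol_beta' and 'tol_pi')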
tol_beta <- L2norm(betaMat[iter + 1L, ] - betaMat[iter, ]) /
L2norm(betaMat[iter + 1L, ] + betaMat[iter, ])
tol_pi <- L2norm(oneFit$piVec - incDat$piVec) /
L2norm(oneFit$piVec + incDat$piVec)
betaHat <- betaEst$estimate
if (tol_beta < control$tol_beta &&
tol_pi < control$tol_pi) {
break
} else if (iter == control$ecm_max_iter) {
warning("Reached the maximum number of ECM iterations.")
}
}
## keep the one fit maximizing observed log likelihood
logL <- rmNA(logL)
            n_step <- length(logL)
            logL_max <- logL_max_vec[idx_start] <- logL[n_step]
            beta_est_mat[idx_start, ] <- betaHat
if (logL_max > logL_max0) {
logL_max0 <- logL_max
logL0 <- logL
oneFit0 <- oneFit
betaMat0 <- betaMat
piVec0 <- piVec
start$beta_start <- betaMat[1L, ]
}
}
}
## clean-up NA's
betaMat0 <- rmNA(betaMat0)
## prepare for outputs
piEst <- oneFit0$piVec[(reOrderIdx <- order(orderInc))]
start$pi_start <- piVec0[reOrderIdx]
## start$logL_end <- logL_max_vec
## start$beta_mat_end <- t(beta_est_mat)
## update results
h0Dat$h0Vec <- oneFit0$h0Vec
colnames(h0Dat) <- c("time", "h0")
h_cDat$h_cVec <- oneFit0$h_cVec
incDat$xExp <- oneFit0$xExp
incDat$piVec <- oneFit0$piVec
betaEst <- oneFit0$betaEst
betaHat <- as.numeric(betaEst$estimate)
## estimates for beta
est_beta <- matrix(NA, nrow = nBeta, ncol = 5L)
colnames(est_beta) <- c("coef", "exp(coef)", "se(coef)", "z", "Pr(>|z|)")
rownames(est_beta) <- covar_names
est_beta[, "coef"] <- betaHat
est_beta[, "exp(coef)"] <- exp(est_beta[, "coef"])
est_beta[, "se(coef)"] <- NA_real_
est_beta[, "z"] <- est_beta[, "coef"] / est_beta[, "se(coef)"]
est_beta[, "Pr(>|z|)"] <- 2 * stats::pnorm(- abs(est_beta[, "z"]))
## output: processed data frame
dat$ID <- factor(dat$ID, levels = unique(dat$ID),
labels = unique(dat0@ID))
dat$ID <- as.character(dat$ID)
varNames0 <- sapply(seq_len(3L), function(a) {
as.character(formula[[2L]][[a + 1]])
})
colnames(dat)[seq_len(3L)] <- varNames0
## output: na.action
na.action <- if (is.null(attr(mf, "na.action")))
options("na.action")[[1L]]
else
paste0("na.", class(attr(mf, "na.action")))
## results to return
methods::new("iCoxph",
call = Call,
formula = formula,
nObs = nObs,
data = dat,
estimates = list(beta = est_beta,
pi = piEst,
baseline = h0Dat),
start = start,
control = control,
na.action = na.action,
xlevels = .getXlevels(mt, mf),
contrasts = list(attr(mm, "contrasts")),
convergeCode = betaEst$code,
logL = logL0)
}
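## A minimal usage sketch, kept in comments so that nothing runs at package
## load time. It assumes a data frame 'simuDat' with subject 'ID', 'time',
## 'event', and covariates 'x1' and 'x2' (e.g., as produced by a data
## simulation helper); adjust the names to the data at hand:
##     fit <- iCoxph(Survi(ID, time, event) ~ x1 + x2, data = simuDat,
##                   start = iCoxph.start(methods = "nearest_hazard"),
##                   control = iCoxph.control(tol_beta = 1e-5))
##     coef(fit)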
##' Auxiliary for Controlling iCoxph Fitting
##'
##' Auxiliary function for \code{\link{iCoxph}} that enables users to specify
##' the control parameters of the model estimation procedure. Internally, the
##' arguments \code{cm_gradtol}, \code{cm_stepmax}, \code{cm_steptol}, and
##' \code{cm_max_iter} are passed to function \code{\link[stats]{nlm}} as
##' \code{gradtol}, \code{stepmax}, \code{steptol}, and \code{iterlim},
##' respectively.
##'
##' @param tol_beta A positive value specifying the tolerance that determines
##'     the convergence of the covariate coefficient estimates. The tolerance
##'     is compared with the relative change between estimates from two
##'     consecutive iterations, measured by the ratio of the L2-norm of their
##'     difference to the L2-norm of their sum. The default value is
##'     \code{1e-6}.
##' @param tol_pi A positive value specifying the tolerance that determines
##'     the convergence of the probability estimates of uncertain records
##'     being true. The tolerance is compared with the relative change between
##'     estimates from two consecutive iterations, measured by the ratio of
##'     the L2-norm of their difference to the L2-norm of their sum. The
##'     default value is \code{1e-8}.
##' @param cm_gradtol A positive scalar giving the tolerance at which the scaled
##' gradient is considered close enough to zero to terminate CM steps. The
##' default value is \code{1e-6}.
##' @param cm_stepmax A positive scalar that gives the maximum allowable scaled
##' step length in CM steps. The default value is \code{1e2}.
##' @param cm_steptol A positive scalar providing the minimum allowable
##'     relative step length in CM steps. The default value is \code{1e-6}.
##' @param cm_max_iter A positive integer specifying the maximum number of
##' iterations to be performed before each CM step is terminated. The
##' default value is \code{1e2}.
##' @param ecm_max_iter A positive integer specifying the maximum number of
##' iterations to be performed before the ECM algorithm is terminated. The
##' default value is \code{2e2}.
##' @param ... Other arguments for future usage. A warning will be thrown if
##' any invalid argument is specified.
##'
##' @return A list of class \code{iCoxph.control} containing all specified
##'     control parameters.
##'
##' @examples
##' ## See examples of function 'iCoxph'.
##'
##' @seealso
##' \code{\link{iCoxph}} for fitting integrative Cox model.
##'
##' @export
iCoxph.control <- function(tol_beta = 1e-6,
tol_pi = 1e-8,
cm_gradtol = 1e-6,
cm_stepmax = 1e2,
cm_steptol = 1e-6,
cm_max_iter = 1e2,
ecm_max_iter = 2e2,
...)
{
## controls for function stats::nlm
if (! is.numeric(cm_gradtol) || cm_gradtol <= 0)
stop("The value of 'cm_gradtol' must be > 0.", call. = FALSE)
if (! is.numeric(cm_stepmax) || cm_stepmax <= 0)
stop("The value of 'cm_stepmax' must be > 0.", call. = FALSE)
if (! is.numeric(cm_steptol) || cm_steptol <= 0)
stop("The value of 'cm_steptol' must be > 0.", call. = FALSE)
if (! is.numeric(cm_max_iter) || cm_max_iter <= 0)
stop("The maximum number of iterations in the CM step must be > 0.",
call. = FALSE)
## determining convergence of ECM
if (! is.numeric(tol_beta) || tol_beta <= 0)
stop("The value of 'tol_beta' must be positive.", call. = FALSE)
if (! is.numeric(tol_pi) || tol_pi <= 0)
stop("The value of 'tol_pi' must be positive.", call. = FALSE)
if (! is.numeric(ecm_max_iter) || ecm_max_iter <= 0)
stop("The maximum number of ECM iterations must be positive.",
call. = FALSE)
## throw warning if anything is captured by `...`
warn_dots(...)
## parameters in a list
out <- list(tol_beta = tol_beta,
tol_pi = tol_pi,
cm_gradtol = cm_gradtol,
cm_stepmax = cm_stepmax,
cm_steptol = cm_steptol,
cm_max_iter = cm_max_iter,
ecm_max_iter = ecm_max_iter)
## add class for ease of checking
class(out) <- "iCoxph.control"
## return
out
}
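## e.g., a tighter ECM tolerance with a larger iteration budget; recall that
## the 'cm_*' values are forwarded to stats::nlm():
##     ctl <- iCoxph.control(tol_beta = 1e-8, ecm_max_iter = 500)
##     ctl$cm_gradtol  # passed to nlm() as 'gradtol'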
##' Auxiliary for Starting iCoxph Fitting
##'
##' Auxiliary function for \code{\link{iCoxph}} that enables users
##' to specify the starting values of the model estimation procedure.
##'
##' @param beta_vec A numeric vector of starting values for the coefficient
##'     estimates. The default values are the coefficient estimates from the
##'     regular Cox model fitted only to records without uncertainty. If the
##'     censoring rate among subjects having unique certain records is
##'     extremely high (> 99\%), or if that fit fails (e.g., due to a perfect
##'     separation between the event indicator and a predictor), the starting
##'     values are reset to all zeros.
##' @param beta_mat A numeric matrix that consists of additional starting values
##' of coefficient estimates in columns. The default value is \code{NULL}.
##' @param methods A character vector specifying the initialization methods
##'     for the probabilities of uncertain records being true. The available
##'     methods are \code{"nearest_hazard"}, which initializes the baseline
##'     hazard by the nearest (left) neighbor, and \code{"unit_hazard"}, which
##'     assumes a unit baseline hazard. Partial matching on method names is
##'     supported for ease of typing. By default, both methods are used. See
##'     Wang et al. (2020) for details of the initialization methods.
##' @param ... Other arguments for future usage. A warning will be thrown if
##' any invalid argument is specified.
##'
##' @return A list of class \code{iCoxph.start} containing all specified
##'     starting values of the parameters to be estimated from the model.
##'
##' @examples
##' ## See examples of function 'iCoxph'.
##'
##' @seealso
##' \code{\link{iCoxph}} for fitting integrative Cox model.
##'
##' @export
iCoxph.start <- function(beta_vec = NULL,
beta_mat = NULL,
methods = c("nearest_hazard", "unit_hazard"),
...)
{
## match methods
methods <- match.arg(methods, several.ok = TRUE)
## throw out warning if any invalid argument is captured by `...`
warn_dots(...)
## parameters in a list
out <- list(beta_vec = beta_vec,
beta_mat = beta_mat,
methods = methods)
## add class for ease of checking
class(out) <- "iCoxph.start"
## return
out
}
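## e.g., two extra columns of starting coefficients (for a model with three
## covariates) in addition to the default start from the regular Cox fit:
##     st <- iCoxph.start(beta_mat = cbind(rep(0, 3), rep(0.5, 3)),
##                        methods = "nearest_hazard")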
### internal functions =========================================================
## perform one step of ECM algorithm
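## (E-step: update the posterior probabilities 'p_jk' of each record being
## true; CM-steps: update the baseline hazards of the event and censoring
## times, then the coefficients beta through one stats::nlm() call)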
ic_oneECMstep <- function(betaHat, h0Dat, h_cDat, dat, xMat, tied, control)
{
## update results involving beta estimates
dat$betaX <- as.numeric(xMat %*% betaHat)
dat$xExp <- exp(dat$betaX)
dat$xExp <- ifelse(is.infinite(dat$xExp), 1e50, dat$xExp)
## use initial value of pi
dat$p_jk <- dat$piVec
h0Dat$h0Vec <- ic_h0t(dat, tied = tied)
h_cDat$h_cVec <- ic_h_c(dat, tied = tied)
    ## snap extreme probabilities to exactly 0 or 1 to help convergence
dat$p_jk <- ifelse(dat$p_jk < 1e-3, 0, ifelse(dat$p_jk > 1 - 1e-3,
1, dat$p_jk))
## update baseline hazard rate of event times
h0Dat$H0Vec <- cumsum(h0Dat$h0Vec)
## update baseline hazard rate of censoring times
h_cDat$H_cVec <- cumsum(h_cDat$h_cVec)
if (tied) {
time_idx <- match(dat$time, h0Dat$time)
dat$h0Vec <- with(dat, ifelse(eventIdx, h0Dat$h0Vec[time_idx], 0))
dat$H0Vec <- h0Dat$H0Vec[time_idx]
dat$h_cVec <- with(dat, ifelse(eventIdx, 0, h_cDat$h_cVec[time_idx]))
dat$H_cVec <- h_cDat$H_cVec[time_idx]
} else {
dat$h0Vec <- with(dat, ifelse(eventIdx, h0Dat$h0Vec, 0))
dat$H0Vec <- h0Dat$H0Vec
dat$h_cVec <- with(dat, ifelse(eventIdx, 0, h_cDat$h_cVec))
dat$H_cVec <- h_cDat$H_cVec
}
dat$hVec <- with(dat, h0Vec * xExp)
dat$HVec <- with(dat, H0Vec * xExp)
dat$sVec <- exp(- dat$HVec)
dat$G_cVec <- exp(- dat$H_cVec)
## compute p_jk for each subject
## for observed log-likelihood function
dat$w_jk <- with(dat, ifelse(eventIdx,
piVec * hVec * sVec * G_cVec,
piVec * sVec * h_cVec * G_cVec))
dat$p_jk_denom <- with(dat, aggregateSum(w_jk, ID, simplify = FALSE))
dat$p_jk <- with(dat, ifelse(dupIdx, w_jk / p_jk_denom, 1))
## update h_0(t)
h0Dat$h0Vec <- ic_h0t(dat, tied = tied)
## update h_c(t)
h_cDat$h_cVec <- ic_h_c(dat, tied = tied)
## update beta
betaEst <- stats::nlm(ic_logLbeta, p = betaHat,
dat = dat, xMat = xMat,
tied = tied,
hessian = FALSE,
check.analyticals = FALSE,
gradtol = control$cm_gradtol,
stepmax = control$cm_stepmax,
steptol = control$cm_steptol,
iterlim = control$cm_max_iter)
## log-likelihood function under observed data
logL <- sum(log(dat$p_jk_denom))
## update h0_jk and h_c_jk with previous (or initial) estimates of beta
## h0Vec <- ic_h0t(dat, tied)
## h_cVec <- ic_h_c(dat, tied)
list(betaEst = betaEst,
h0Vec = h0Dat$h0Vec,
h_cVec = h_cDat$h_cVec,
piVec = dat$p_jk,
logL = logL,
xExp = dat$xExp)
}
## profile log-likelihood function of beta
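## (returns the negative log-likelihood with "gradient" and "hessian"
## attributes attached so that stats::nlm() can use the analytic derivatives)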
ic_logLbeta <- function(param, dat, xMat, tied)
{
    ## update results that depend on the beta estimates, param
dat$betaX <- as.vector(xMat %*% param)
dat$xExp <- exp(dat$betaX)
dat$xExp <- ifelse(is.infinite(dat$xExp), 1e50, dat$xExp)
## prepare intermediate results for later computation
parSeq <- seq_along(param)
xMatDeltaN <- xMat[dat$eventIdx, ] * dat[dat$eventIdx, "p_jk"]
delta_tildeN <- ic_deltaTildeN(dat, tied)
betaXdeltaN <- with(dat, eventIdx * betaX * p_jk)
k_0 <- ic_k0(dat, tied)
k_1 <- ic_k1(parSeq, dat, xMat, tied)
k_2 <- ic_k2(parSeq, dat, xMat, tied)
## profile log-likelihood of beta in EM
pell <- sum(betaXdeltaN) - sum(ifelse(delta_tildeN > 0,
delta_tildeN * log(k_0), 0))
## penalty term to avoid solution with xExp being Inf
penal_inf <- any(is.infinite(dat$xExp)) * 1e20
negLogL <- - pell + penal_inf
## gradient
gradLogL <- ic_dLbeta(xMatDeltaN, k_0, k_1, delta_tildeN)
attr(negLogL, "gradient") <- - gradLogL
## hessian
hesMat <- ic_d2Lbeta(parSeq, k_0, k_1, k_2, delta_tildeN)
attr(negLogL, "hessian") <- - hesMat
## return
negLogL
}
## compute baseline hazard rate function
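## (a Breslow-type estimator weighted by the current probabilities 'p_jk':
## the jump at each unique event time is deltaTildeN(t) / K0(t), where the
## two pieces are computed by ic_deltaTildeN() and ic_k0() below)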
ic_h0t <- function(dat, tied) {
numer <- ic_deltaTildeN(dat, tied)
denom <- ic_k0(dat, tied)
ifelse(denom > 0, numer / denom, 0)
}
## building blocks that follow the notation in the manuscript
ic_deltaTildeN <- function(dat, tied) {
out <- with(dat, as.numeric(eventIdx) * p_jk)
if (tied)
return(aggregateSum(out, dat$time))
out
}
## baseline hazard rate for censoring time
ic_h_c <- function(dat, tied) {
numer <- ic_deltaC(dat, tied)
denom <- revcumsum(dat$p_jk)
if (tied)
denom <- denom[dat$firstIdx]
ifelse(numer > 0, numer / denom, 0)
}
ic_deltaC <- function(dat, tied) {
out <- with(dat, as.numeric(! eventIdx) * p_jk)
if (tied)
return(aggregateSum(out, dat$time))
out
}
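## weighted risk-set sum: K0(t) = sum of p_jk * exp(x'beta) over records with
## time >= t; revcumsum() applies because records were sorted by time in
## iCoxph()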
ic_k0 <- function(dat, tied) {
p_jk_xExp <- with(dat, p_jk * xExp)
out <- revcumsum(p_jk_xExp)
if (tied)
out <- out[dat$firstIdx]
out
}
ic_k1 <- function(parSeq, dat, xMat, tied) {
## matrix of dimension # unique event time by # parameters
out <- sapply(parSeq, function(ind, dat, xMat) {
p_jk_xExp_x <- with(dat, p_jk * xExp) * xMat[, ind]
revcumsum(p_jk_xExp_x)
}, dat = dat, xMat = xMat)
if (tied)
out <- out[dat$firstIdx, ]
out
}
ic_k2 <- function(parSeq, dat, xMat, tied) {
ind_grid <- expand.grid(parSeq, parSeq)
## matrix of dimension # unique event time by (# parameters) ^ 2
out <- mapply(function(ind1, ind2) {
p_jk_xExp_x2 <- with(dat, p_jk * xExp) * xMat[, ind1] * xMat[, ind2]
revcumsum(p_jk_xExp_x2)
}, ind_grid[, 1L], ind_grid[, 2L])
if (tied)
out <- out[dat$firstIdx, ]
out
}
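## gradient of the profile log-likelihood of beta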
ic_dLbeta <- function(xMatDeltaN, k_0, k_1, delta_tildeN) {
sum_jk <- colSums(xMatDeltaN)
int_t <- colSums(k_1 / k_0 * delta_tildeN, na.rm = TRUE)
sum_jk - int_t
}
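## hessian of the profile log-likelihood of beta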
ic_d2Lbeta <- function(parSeq, k_0, k_1, k_2, delta_tildeN) {
## part 1
nPar <- length(parSeq)
mat1 <- k_2 / k_0 * delta_tildeN
part1 <- matrix(colSums(mat1, na.rm = TRUE), nPar, nPar)
## part 2
mat2 <- k_1 / k_0
mat2[is.na(mat2)] <- 0
parGrid <- expand.grid(parSeq, parSeq)
mat2plus <- mapply(function(ind1, ind2) {
mat2[, ind1] * mat2[, ind2]
}, parGrid[, 1L], parGrid[, 2L])
part2 <- matrix(colSums(mat2plus * delta_tildeN), nPar, nPar)
## return
part2 - part1
}
## determine initial piVec from given censorRate
ic_initPi <- function(censorRate, dat, equally = FALSE, ...)
{
## nonsense to eliminate cran checking note
eventIdx <- NULL
## mixture probability for each subject: piVec
numTab <- table(dat$ID)
dat$numRecord <- numTab[match(as.character(dat$ID), names(numTab))]
## subject ID with censoring records
cenID <- with(base::subset(dat, ! eventIdx), unique(ID))
cenIdx <- as.integer(dat$ID %in% cenID)
## for subjects with multiple records
idx1 <- with(dat, numRecord > 1L & eventIdx)
idx2 <- with(dat, numRecord > 1L & (! eventIdx))
piVec <- rep(1, NROW(dat))
if (equally) {
piVec[idx1 | idx2] <- 1 / dat[idx1 | idx2, "numRecord"]
} else {
piVec[idx1] <- (1 - censorRate * cenIdx[idx1]) /
(dat[idx1, "numRecord"] - cenIdx[idx1])
piVec[idx2] <- censorRate
}
piVec
}
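## e.g., with censorRate = 0.4, a subject having one event record and one
## censoring record gets (1 - 0.4) / (2 - 1) = 0.6 for the event record and
## 0.4 for the censoring record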
## assign initial piVec not based on censoring
ic_initPi2 <- function(pi0, dat, randomly = FALSE, ...)
{
nData <- NROW(dat)
piVec <- rep(1L, nData)
dupIdx <- duplicated(dat$ID)
dupID <- dat$ID[dupIdx]
uniIdx <- ! dat$ID %in% unique(dupID)
numTab <- table(dupID)
numRecord <- numTab[match(as.character(dupID), names(numTab))]
firstIdx <- ! (dupIdx | uniIdx)
piVec[firstIdx] <- pi0
piVec[dupIdx] <- (1 - pi0) / numRecord
## randomly impute within subject
if (randomly) {
randomOrder <- stats::runif(nData)
piVec <- piVec[order(dat$ID, randomOrder)]
}
## return
piVec
}
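## e.g., with pi0 = 0.7, a subject with three records gets 0.7 for its first
## record and (1 - 0.7) / 2 = 0.15 for each of the two duplicated records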
## \item \code{censorRate}: A positive numeric vector taking values between 0
## and 1. If a vector of length more than one is given, multiple starting
## values on prior probabilities are applied. In each initialization,
## individual value in the vector is taken as parameter in the multinomial
## distribution indicating the prior probability of censoring record being
## true. The remaining probability mass is equally assigned to the
## possible event records. For example, when \code{censorRate = 1} is
## specified, the extended model reduces to the regular Cox model fitting
## data before matching, which takes all the censoring records as true. On
## the contrary, when \code{censorRate = 0} is specified, the model does
## not take possible censoring records into account. If \code{multiStart =
## FALSE}, the default value of \code{censorRate} is the sample censoring
## rate among subjects having records without uncertainty.
## If \code{parametricOnly}
##
## \item \code{multiStart}: A logical value specifying whether multiple
## starting values of \code{censorRate} is needed. The argument is ignored
## when explicit \code{censorRate} or \code{piVec} is specified. The
## default value is \code{FALSE} for less computation time. However,
## enabling multiple starting values \code{multiStart = TRUE} is suggested
## to reduce the influence of possible local maximizer. If \code{TRUE},
## \code{censorRate} takes a grid from 0 to 1 with step 0.02 by default.
##
## }
##
## take care of starting values internally
iCoxph_start <- function(beta_vec = NULL,
beta_mat = NULL,
methods = NULL,
...,
nBeta_,
dat_)
{
## nonsense to eliminate cran checking note
ID <- NULL
dupID <- with(dat_, unique(ID[duplicated(ID)]))
dupIdx <- dat_$ID %in% dupID
uniDat <- base::subset(dat_, ! dupIdx)
uni_xMat <- as.matrix(uniDat[, - seq_len(3L)])
xMat <- as.matrix(dat_[, - seq_len(3L)])
## censorRate0 <- round(1 - mean(uniDat$event), 2)
## if (is.null(censorRate)) {
## ## set censorRate from sample truth data
## ## if missing at random, the true censoring rate
## ## can be estimated by true data of unique records
## step_by <- 0.02
## censorRate <- if (multiStart) {
## seq.int(0, 1, step_by)
## } else if (semiparametric_only) {
## NA_real_
## } else {
## censorRate0
## }
## } else if (any(censorRate > 1 | censorRate < 0)) {
## stop("Starting prob. of censoring case being true",
## "should between 0 and 1.", call. = FALSE)
## }
## use parametric estimate as starting values
## if (any(parametric)) {
## event_funs <- ic_parametric_start(uniDat$time, uniDat$event, uni_xMat)
## cen_funs <- ic_parametric_start(uniDat$time, 1 - uniDat$event)
## hVec <- event_funs$haz_fun(dat_$time, xMat)
## sVec <- event_funs$surv_fun(dat_$time, xMat)
## h_cVec <- cen_funs$haz_fun(dat_$time)
## G_cVec <- cen_funs$surv_fun(dat_$time)
## if (parametric[1L]) {
## ## following the equations derived
## log_w_jk_1 <- ifelse(
## dat_$event > 0,
## log(hVec) + log(sVec) + log(G_cVec),
## log(h_cVec) + log(G_cVec) + log(sVec)
## )
## w_jk_1 <- exp(log_w_jk_1)
## p_jk_denom_1 <- aggregateSum(w_jk_1, dat_$ID, simplify = FALSE)
## pi_par_1 <- ifelse(dupIdx, w_jk_1 / p_jk_denom_1, 1)
## piMat <- cbind(piMat, pi_par_1)
## }
## if (length(parametric) > 1L && parametric[2L]) {
## ## if hazard estimates is not reliable
## log_w_jk_2 <- log(sVec) + log(G_cVec)
## w_jk_2 <- exp(log_w_jk_2)
## p_jk_denom_2 <- aggregateSum(w_jk_2, dat_$ID, simplify = FALSE)
## pi_par_2 <- ifelse(dupIdx, w_jk_2 / p_jk_denom_2, 1)
## piMat <- cbind(piMat, pi_par_2)
## }
## }
## use non/semi-parametric estimates
if (is.null(methods)) {
stop("No iniliaization method was specified.", call. = FALSE)
}
pi_mat <- NULL
## initialization of baseline functions
event_funs <- ic_semi_parametric_start(
uniDat$time, uniDat$event, uni_xMat
)
cen_funs <- ic_semi_parametric_start(uniDat$time, 1 - uniDat$event)
hVec <- event_funs$haz_fun(dat_$time, xMat)
sVec <- event_funs$surv_fun(dat_$time, xMat)
h_cVec <- cen_funs$haz_fun(dat_$time)
G_cVec <- cen_funs$surv_fun(dat_$time)
## initialized baseline hazards with nearest (left) neighbor
if ("nearest_hazard" %in% methods) {
## following the equations derived
log_w_jk_3 <- ifelse(
dat_$event > 0,
log(hVec) + log(sVec) + log(G_cVec),
log(h_cVec) + log(G_cVec) + log(sVec)
)
w_jk_3 <- exp(log_w_jk_3)
p_jk_denom_3 <- aggregateSum(w_jk_3, dat_$ID, simplify = FALSE)
pi_par_3 <- ifelse(dupIdx, w_jk_3 / p_jk_denom_3, 1)
pi_mat <- cbind(pi_mat, "nearest_hazard" = pi_par_3)
}
## use unit baseline hazards (or equivalently ignore them)
if ("unit_hazard" %in% methods) {
log_w_jk_4 <- log(sVec) + log(G_cVec)
w_jk_4 <- exp(log_w_jk_4)
p_jk_denom_4 <- aggregateSum(w_jk_4, dat_$ID, simplify = FALSE)
pi_par_4 <- ifelse(dupIdx, w_jk_4 / p_jk_denom_4, 1)
pi_mat <- cbind(pi_mat, "unit_hazard" = pi_par_4)
}
## initialize covariate coefficient: beta
if (is.null(beta_vec)) {
## if high censoring for subjects having unique records
if (mean(uniDat$event) < 0.01) {
beta_vec <- matrix(rep(0, nBeta_), ncol = 1)
} else {
uniDat$eventIdx <- NULL
tmp <- tryCatch(
with(uniDat, rcpp_coxph(time, event, uni_xMat)),
warning = function(w) {
warning(w)
return(NULL)
}, error = function(e) {
warning(e)
return(NULL)
})
beta <- if (is.null(tmp)) {
rep(0, nBeta_)
} else {
as.numeric(tmp$coef)
}
beta_vec <- as.matrix(beta)
}
}
beta_mat <- cbind(beta_mat, beta_vec)
## some quick checks on beta
if (nrow(beta_mat) != nBeta_)
stop(wrapMessages(
"The number of starting values for coefficients of",
"covariates does not match with the specified formula."
), call. = FALSE)
## some quick checks on pi
if (! is.null(pi_mat)) {
        if (nrow(pi_mat) != nrow(dat_))
            stop("'pi_mat' must have the same number of rows as the data.")
        if (any(pi_mat > 1 | pi_mat < 0))
            stop("Elements of 'pi_mat' must be between 0 and 1.")
}
## return
list(beta_mat = beta_mat,
pi_mat = pi_mat)
}
### return parametric hazard function and survival function
## ic_parametric_start <- function(time, event, xMat = NULL)
## {
## ## transform estimates from AFT form to PH form for Weibull model
## transCoef <- function(survRegObj) {
## shape <- 1 / survRegObj$scale
## betaEst <- survRegObj$coefficients
## lambda0 <- exp(- shape * betaEst[1])
## names(lambda0) <- NULL
## betaEst <- - shape * betaEst[- 1]
## list(beta = betaEst, lambda0 = lambda0)
## }
## ## fitting with survreg
## fm <- if (is.null(xMat)) {
## survival::Surv(time, event) ~ 1
## } else {
## survival::Surv(time, event) ~ xMat
## }
## fit <- survival::survreg(fm)
## shape <- 1 / fit$scale
## fit_list <- transCoef(fit)
## betaHat <- fit_list$beta
## lambda0 <- fit_list$lambda0
## ## (baseline) hazard function and survival function
## haz_fun <- function(time, xMat = NULL) {
## betaX <- if (! is.null(xMat)) {
## as.numeric(xMat %*% betaHat)
## } else {
## 0
## }
## exp(log(lambda0) + log(shape) +
## (shape - 1) * log(time) + betaX)
## }
## surv_fun <- function(time, xMat = NULL) {
## betaX <- if (! is.null(xMat)) {
## as.numeric(xMat %*% betaHat)
## } else {
## 0
## }
## HVec <- exp(log(lambda0) + shape * log(time) + betaX)
## exp(- HVec)
## }
## ## return
## list(haz_fun = haz_fun,
## surv_fun = surv_fun)
## }
### return semi-parametric hazard function and survival function
ic_semi_parametric_start <- function(time, event, xMat = NULL)
{
## set default value
fit <- NULL
if (! is.null(xMat)) {
## fitting may fail in bootstrap samples
fit <- tryCatch(rcpp_coxph(time, event, xMat),
warning = function(w) {
return(NULL)
},
error = function(e) {
return(NULL)
})
}
    if (is.null(fit)) {
        ## fall back to the marginal estimate with zero covariate effects so
        ## that 'betaHat' is always defined for the closures below
        betaHat <- if (is.null(xMat)) 0 else rep(0, ncol(xMat))
        haz0 <- rcpp_mcf_right(time, event)
        haz0 <- data.frame(time = haz0$time,
                           hazard = haz0$cum_rate)
} else {
betaHat <- as.numeric(fit$coef)
haz0 <- data.frame(time = fit$baseline$time,
hazard = fit$baseline$h0)
}
## stepfun hazard function
haz_fun <- function(time, xMat = NULL) {
betaX <- if (! is.null(xMat)) {
as.numeric(xMat %*% betaHat)
} else {
0
}
## only consider positive hazard here
inst_haz0 <- diff(c(0, haz0$hazard))
inst_bool <- inst_haz0 > 0
base_haz0 <- c(haz0$hazard[inst_bool][1], inst_haz0[inst_bool])
base_haz_fun <- stats::stepfun(haz0$time[inst_bool], base_haz0)
base_haz_fun(time) * exp(betaX)
}
## survival function
surv_fun <- function(time, xMat = NULL) {
betaX <- if (! is.null(xMat)) {
as.numeric(xMat %*% betaHat)
} else {
0
}
haz_fun <- stats::stepfun(haz0$time, c(0, haz0$hazard))
exp(- haz_fun(time) * exp(betaX))
}
## return
list(haz_fun = haz_fun,
surv_fun = surv_fun)
}
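## A usage sketch (in comments): step functions for the censoring
## distribution can be obtained by flipping the event indicator, mirroring
## the call in iCoxph_start() above:
##     cen_funs <- ic_semi_parametric_start(uniDat$time, 1 - uniDat$event)
##     cen_funs$surv_fun(c(1, 2, 3))  # censoring survival probabilities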