R/pred.2ph.R
In addhazard: Fit Additive Hazards Models for Survival Analysis

#' Prediction Based on the Additive Hazards Model Fitted from Two-phase Sampling
#'
#' This function predicts a subject's overall hazard rates at given time points
#' based on this subject's covariate values. The prediction function is an object
#' from \code{\link{ah.2ph}}. The  estimating procedures follow Hu (2014).
#'
#' @param object an object of class inhering from 'ah.2ph'.
#' @param newdata a dataframe of an individual's predictors.
#' @param newtime  a given sequence of time points at which the prediction is performed.
#' @param ...  further arguments passed to or from other methods.
#'
#' @return A dataframe including the given time points, predicted hazards, their
#'  standard errors, their variances, the phase I component of the variance for
#'  predicted hazards and the phase II component of the variance.
#'
#' @seealso \code{\link{ah.2ph}} for fitting the additive hazards model with two-phase
#   sampling and \code{\link{nwtsco}} for the description of nwtsco dataset
#'
#' @importFrom survival Surv
#' @importFrom stats delete.response model.matrix terms
#' @export
#'
#' @references
#' Jie Hu (2014) A Z-estimation System for Two-phase Sampling with Applications
#'               to Additive Hazards Models and Epidemiologic Studies. Dissertation,
#'               University of Washington.
#'
#' @examples
#' library(survival)
#' ### load data
#' nwts <- nwtsco[1:100,]
#'
#' ### create strata based on  institutional histology and disease status
#' nwts$strt <- 1+nwts$instit
#' ### add a stratum containing all (relapsed) cases
#' nwts$strt[nwts$relaps==1] <- 3
#'
#' ### assign phase II subsampling probabilities
#' ### oversample unfavorable histology (instit =1) and cases
#' ### Pi = 0.5 for instit =0, Pi =1 for instit =1 and relaps =1
#' nwts$Pi<-  0.5 * (nwts$strt == 1) + 1 * (nwts$strt == 2) + 1 * (nwts$strt == 3)
#'
#' ### generate phase II sampling indicators
#' N <- dim(nwts)[1]
#' nwts$in.ph2 <-  rbinom(N, 1, nwts$Pi)
#'
#' ### fit an additive hazards model to  two-phase sampling data without calibration
#' fit1 <- ah.2ph(Surv(trel,relaps) ~ age + histol,
#'                data = nwts,
#'                ties = FALSE,
#'                R = in.ph2, Pi = Pi,
#'                robust = FALSE)
#'
#' ###  input the new data for prediction
#' newdata <- nwtsco[101,]
#' ###  based on the fitted model fit1, perform prediction at time points t =3 and t= 5
#' predict(fit1, newdata, newtime = c(3,5))
#'
#' ### fit an additve hazards model to  two-phase sampling data with calibration
#' ### The calibration variable is stage
#' fit2 <- ah.2ph(Surv(trel,relaps) ~ age + histol, data = nwts, R = in.ph2, Pi = Pi,
#'                                    ties = FALSE, robust = FALSE, calibration.variables = "stage")
#'
#' ### based on the fitted model fit2, perform prediction at time points t =3 and t= 5
#' predict(fit2, newdata, newtime = c(3,5))
#'
#' \dontrun{
#' ### The calibration variable is stage, when set robust = TRUE
#' fit3 <- ah.2ph(Surv(trel,relaps) ~ age + histol, data = nwts, R = in.ph2, Pi = Pi,
#'                                    ties = FALSE, robust = TRUE, calibration.variables = "stage")
#'
#' ### based on the fitted model fit2, perform prediction at time points t =3 and t= 5
#' predict(fit3, newdata, newtime = c(3,5))
#' }

predict.ah.2ph <- function(object, newdata, newtime, ...) {

    # phase I complete data object
    object1 <- object$fit.pha1

    ######################### creat model matrix for the new data ##########
    tt <- terms(object1)
    if (!inherits(object, "ah.2ph")) {
        warning("calling predict.2ph(<fake-ah-object>) ...")
    }

    Terms <- delete.response(tt)
    # identify factor variable
    factor.var.list <- names(object1$model)[sapply(object1$model,
        is.factor) == T]
    # assign levels to newdata variable according to the
    # original data
    if (length(factor.var.list) != 0) {
        for (i in 1:length(factor.var.list)) {
            factor.var <- factor.var.list[i]
            newdata[[factor.var]] <- factor(newdata[[factor.var]],
                levels = levels(object1$model[[factor.var]]))
        }
    }
    # retrieve the new model matrix based on the new data
    pred <- model.matrix(Terms, newdata)
    # delete the intercept
    pred <- as.numeric(pred[, -1])
    ###########################################################################

    Pi.pha2 <- object$Pi.pha2

    if (object1$robust) {
        stop("When you set robust = FALSE, you assume the additive hazards model
             is not the true model. Prediction based on this model is no longer valid.
             We stop the prediction process. ")
    } else {
         ## No calibration
        if (!length(object$calibration.variables)) {

          pred.result <- predict.ah(object1, newdata, newtime,
                                    level = 0.95)
          L <- pred.result$L
          L.var.pha1 <- pred.result$L.se^2

          ### calculate the  variance due to phase II sampling
          ingr <- ingredients(object1)

            ############################ find the positions of a list of given newtime on the
            ############################ original timeline######################
            t.pos <- NULL
            t.unique <- ingr$t.unique

            for (i in 1:length(newtime)) {


                # newtime s is less than t_1, then we report
                if (newtime[i] < t.unique[1]) {
                  print(paste("The data used for building the ah model does not have
                               enough information for predicting such a small t, t=",
                              newtime[i]))
                } else {

                  # find the position of newtime[i] on the original timeline
                  c <- rank(c(newtime[i], t.unique), ties.method = "max")[1]
                  # when newtime s equals one of the t.unique t_k, the above
                  # rank function returns k+1 on the original timelines when
                  # newtime s in between t_k and t_{k+1}, it returns k+1 on
                  # the original timelines when newtime s = t_1, it returns 1
                  # thus
                  t.pos[i] <- ifelse(c == 1, 1, c - 1)


                }
            }

            ### predicted outcomes are calculated based on fitting an ah
            ### model to only phase II data using assigned weights



            # Calculate the phase II variance
            L.resid <- do.call("cook.L.resid", c(ingr, list(time.pos = ingr$match.event)))




            ingr$wts <- sqrt(1 - Pi.pha2)
            L.var.pha2 <- do.call("L.rvar", c(ingr, object1,
                list(t.pos = t.pos, pred = pred, newtime = newtime,
                  L.resid = L.resid)))

        } else {
            # Calibration

            pred.result <- predict.ah(object1, newdata, newtime,
                                    level = 0.95)
            L <- pred.result$L
            ### Calculate the 1st component of the variance
            L.var.pha1 <- pred.result$L.se^2



            wts.cal <- object1$weights
             ingr <- ingredients(object1)

            aux<-as.matrix(object$calibration.variables)
            aux.pha2<-aux[object$R==1,]
            P<-t(aux)%*%(aux)


            t.pos <- NULL
            t.unique <- ingr$t.unique

            for (i in 1:length(newtime)) {
                # newtime s is less than t_1, then we report
                if (newtime[i] < t.unique[1]) {
                  print(paste("The data used for building the ah model does not
                               have enough information for predicting such a small t, t=",
                              newtime[i]))
                } else {

                  # find the position of newtime[i] on the original timeline
                  c <- rank(c(newtime[i], t.unique), ties.method = "max")[1]
                  # when newtime s equals one of the t.unique t_k, the above
                  # rank function returns k+1 on the original timelines when
                  # newtime s in between t_k and t_{k+1}, it returns k+1 on
                  # the original timelines when newtime s = t_1, it returns 1
                  # thus
                  t.pos[i] <- ifelse(c == 1, 1, c - 1)


                }
            }



            ###Calculate the variance of the predicted value See page 109 ####
            #### of Jie Hu' thesis for the detailed formula #######



            ## retrieve the martingale integral, i.e.  part of \psi in
            ## the formula on page 109 of Jie Hu' thesis

            L.resid <- do.call("cook.L.resid", c(ingr, list(time.pos = ingr$match.event)))

            L.var.pha2 <- do.call("L.rvar.calibration", c(ingr,
                object1, list(t.pos = t.pos, pred = pred, newtime = newtime,
                  L.resid = L.resid, new.wts = sqrt((1 - Pi.pha2)/(wts.cal *
                    Pi.pha2)), aux.pha2 = aux.pha2, P = P, wts.cal = wts.cal)))
        }
    }




    # L.var.pha2<-L.rvar(match.event=match.event,new.wts=sqrt(1-Pi.pha2),L.ncol=L.ncol,
    # eta.cum=eta.cum, resid=resid, L.resid=L.resid,iA=iA, B=B)

    L.var = L.var.pha1 + L.var.pha2
    L.se = sqrt(L.var)
    return(data.frame(L = L, L.se = L.se, L.var = L.var, L.var.pha1 = L.var.pha1,
        L.var.pha2 = L.var.pha2))
}