R/coxDT.R


#'Fit Cox Proportional Hazards Regression Model Under Double Truncation
#'
#'Fits a Cox proportional hazards regression model when the survival time is subject to both left and right truncation. Assumes that no censoring is present in the data.
#'@importFrom survival coxph Surv
#'@importFrom stats model.frame model.matrix model.response na.omit pnorm quantile sd
#'@param formula a formula object, with the response on the left of a ~ operator, and the terms on the right. The response must be a survival object as returned by the \code{Surv} function. NOTE: \code{coxDT} does not handle censoring.
#'@param L vector of left truncation times
#'@param R vector of right truncation times
#'@param data mandatory data frame in which to interpret the variables named in the \code{formula}
#'@param subset an optional vector specifying a subset of observations to be used in the fitting process. All observations are included by default.
#'@param time.var default = FALSE. If TRUE, specifies that time-varying covariates are included in the model.
#'@param subject a vector of subject identification numbers. Only needed if time.var=TRUE (see example).
#'@param B.SE.np number of iterations for bootstrapped standard error (default = 200)
#'@param CI.boot requests bootstrap confidence intervals (default = FALSE)
#'@param B.CI.boot number of iterations for bootstrapped confidence intervals (default = 2000)
#'@param pvalue.boot requests bootstrap p-values (default = FALSE)
#'@param B.pvalue.boot number of iterations for numerator (estimate) of bootstrapped test statistic (default = 500)
#'@param B.pvalue.se.boot number of iterations for denominator (standard error) of bootstrapped test statistic (default = 100)
#'@param trunc.weight truncates the weights at a prespecified level (default = 100). The trade-off is a slight increase in bias in exchange for a reduction in variance.
#'@param print.weights requests the output of the weights (inverse selection probabilities) used in the Cox model (default = FALSE)
#'@param error convergence criterion for the nonparametric selection probabilities (default = 10^-6)
#'@param n.iter maximum number of iterations for computation of the nonparametric selection probabilities (default = 1000)
#'@details Fits a Cox proportional hazards model in the presence of left and right truncation
#'by weighting each subject in the score equation of the Cox model by the probability that they
#'are observed in the sample. These selection probabilities are computed nonparametrically.
#'The estimation procedure here is performed using coxph {survival}, with the inverse of each estimated
#'selection probability supplied through the 'weights' option. This method assumes that the survival and
#'truncation times are independent. Furthermore, this method does not accommodate censoring.
#'Note: If only left truncation is present, set R = Inf.
#'If only right truncation is present, set L = -Inf.
#'
#'@return
#'\item{results.beta}{Displays the estimate, standard error, lower and upper 95\% Wald confidence limits,
#'Wald test statistic and corresponding p-value for each regression coefficient}
#'\item{CI}{Method used for computation of confidence interval: Normal approximation (default) or bootstrap}
#'\item{p.value}{Method used for computation of p-values: Normal approximation (default) or bootstrap}
#'\item{weights}{If print.weights=TRUE, displays the weights used in the Cox model}
#'@references Rennert L and Xie SX (2017). Cox regression model with doubly truncated data. Biometrics. http://dx.doi.org/10.1111/biom.12809.
#'@export
#'@examples
#'##### Example: AIDS data set #####
#'coxDT(Surv(Induction.time,status)~Adult,L.time,R.time,data=AIDS,B.SE.np=2)
#'
#'# WARNING: To save computation time, number of bootstrap resamples for standard error set to 2.
#'# Note: The minimum recommendation is 200, which is the default setting.
#'@examples
#'##### Including time-dependent covariates #####
#'# Accommodating time-dependent covariates in the model is similar to their accommodation in coxph.
#'
#'# The data set may look like the following:
#'
#'# subject start stop event treatment test.score L.time R.time
#'# 1       T.10  T.11  1    X.1       Z.1        L.1    R.1
#'# 2       T.20  T.21  0    X.2       Z.21       L.2    R.2
#'# 2       T.21  T.22  1    X.2       Z.22       L.2    R.2
#'#...
#'
#'# Here the variable 'treatment' and the truncation times 'L.time' and 'R.time' stay the same
#'# from line to line. The variable 'test.score' may vary from line to line. In this example,
#'# subject 1 has only one recorded measurement for test.score, and therefore only has one row
#'# of observations. Subject 2 has two recorded measurements for test score, and therefore has
#'# two rows of observations. In this example, it is assumed that the test score for subject 2
#'# is fixed at Z.21 between (T.20,T.21] and fixed at Z.22 between (T.21,T.22]. Notice that
#'# the event indicator is 0 in the first row of observations corresponding to subject 2,
#'# since they have not yet experienced the event. The status variable changes to 1 in the
#'# row where the event occurs.
#'
#'# Note: The start time cannot precede the left truncation time and must be strictly less than the stop time.
#'
#'# example
#'
#'
#'test.data <- data.frame(
#'list(subject.id = c(1, 2, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10),
#'     start      = c(3, 5, 7, 2, 1, 2, 6, 5, 6, 6, 7, 2, 17),
#'     stop       = c(4, 7, 8, 5, 2, 6, 9, 8, 7, 7, 9, 6, 21),
#'     event      = c(1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1),
#'     treatment  = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1),
#'     test.score = c(5, 6, 7, 4, 6, 9, 3, 4, 4, 7, 6, 4, 12),
#'     L.time     = c(2, 4, 4, 2, 1, 1, 4, 5, 4, 3, 3, 1, 10),
#'     R.time     = c(6, 9, 9, 6, 7, 7, 9, 9, 8, 8, 9, 8, 24)))
#'
#'  coxDT(Surv(start,stop,event)~treatment+test.score,L.time,R.time,data=test.data,
#'  time.var=TRUE,subject=subject.id,B.SE.np=2)




coxDT = function(formula,L,R,data,subset,time.var=FALSE,subject=NULL,B.SE.np=200,CI.boot=FALSE,B.CI.boot=2000,pvalue.boot=FALSE,
                 B.pvalue.boot=500,B.pvalue.se.boot=100,trunc.weight=100,print.weights=FALSE,error=10^-6,n.iter=1000)
{
  if(missing(data)==TRUE) stop("Must specify data frame in 'data =' statement");

  set.seed(1312018)
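  # apply the optional 'subset' argument; when 'subset' is missing, all rows are kept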
  data=data[subset,]

  # removing missing observations
  nrows.data=dim(data)[1];
  data=na.omit(data);
  nrows.data.omit=dim(data)[1];


  # extracting outcomes and covariates
  mf = model.frame(formula=formula,data=data)
  X=model.matrix(attr(mf,"terms"),data=mf)[,-1]
  p=1; n=length(X);
  if(length(dim(X))>0) {p=dim(X)[2]; n=dim(X)[1]}  # number of predictors and observations

  # model.response() on a Surv outcome returns a matrix; as.numeric() flattens it column-wise,
  # so the first n entries are its first column of times
  Y=as.numeric(model.response(mf))[1:n];

  # extracting truncation times
  L=deparse(substitute(L)); R=deparse(substitute(R));
  formula.temp=paste(L,R,sep="~")
  mf.temp=model.frame(formula=formula.temp,data=data)
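  # align the truncation times with the rows kept in the model frame: for each row name of mf,
  # find the matching row of mf.temp (rownames(mf.temp) is passed through sapply's '...' as the
  # second argument, y, of the matching function)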
  obs.data=sapply(rownames(mf),rownames(mf.temp),FUN=function(x,y) which(x==y))
  L=mf.temp[obs.data,1]; R=mf.temp[obs.data,2];



  # weight estimation
  P.obs.y.np=cdfDT(Y,L,R,error,n.iter,display=FALSE)$P.K # estimating selection probabilities for each subject
  weights.np=1/P.obs.y.np                   # weights
  ### truncating weights ###
  weights.np[which(weights.np>trunc.weight)]=trunc.weight
  ### # # # # # # # # #  ###
  P.obs.NP=n*(sum(1/P.obs.y.np))^(-1)       # estimating probability that random subject observed


  # computing estimates of nonparametric weighted estimator
  data.new=data.frame(data,weights.np)
  beta.np=coxph(formula,data=data.new,weights=weights.np)$coefficients

  # computing bootstrapped standard errors

  # first, we extract the vector of subject IDs for bootstrapping data with time-varying covariates
  if(time.var==TRUE)
  {
  subject=deparse(substitute(subject))
  formula.temp2=paste(subject,subject,sep="~");
  subjects=model.response(model.frame(formula.temp2,data=data));
  n.subject=length(unique(subjects));
  }

  B=B.SE.np
  if(CI.boot==TRUE) B=max(B.SE.np,B.CI.boot)
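  # the single bootstrap loop below (size B) is reused for both the standard error and,
  # when CI.boot=TRUE, the percentile confidence intervals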

  beta.boot.np=matrix(0,nrow=B,ncol=p)
  for(b in 1:B) {
    repeat{ # creating repeat loop in case Shen algorithm fails
      if(time.var==FALSE) {temp.sample=sort(sample(n,replace=TRUE))};
      if(time.var==TRUE) {
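        # resample whole subjects (not individual rows) so that each sampled subject's
        # time-varying records are carried along together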
        temp.obs=sort(sample(n.subject,replace=TRUE))
        temp.sample=unlist(sapply(temp.obs,subjects,FUN=function(x,y) which(x==y)))
      }
      Y.temp=Y[temp.sample]; L.temp=L[temp.sample]; R.temp=R[temp.sample];
      if(p==1) X.temp=X[temp.sample];
      if(p>=2) X.temp=X[temp.sample,];

      out.temp=cdfDT(Y.temp,L.temp,R.temp,error,n.iter,display=FALSE)

      if(out.temp$max.iter_reached==0) {break}
    } # ending repeat loop

    # nonparametric weights (Shen) for Cox regression
    P.obs.NP.temp=out.temp$P.K

    weights.np.temp=1/P.obs.NP.temp
    ### truncating weights ###
    weights.np.temp[which(weights.np.temp>trunc.weight)]=trunc.weight
    ### # # # # # # # # #  ###

    # updating data set to include bootstrapped observations
    data.temp=data.frame(data[temp.sample,],weights.np.temp)

    # computing estimates of nonparametric weighted estimator
    beta.boot.np[b,]=coxph(formula,data=data.temp,weights=weights.np.temp)$coefficients
  }
  # standard error
  se.beta.np=apply(beta.boot.np,2,sd);

  # If bootstrap not requested, return normal confidence intervals
  if(CI.boot==FALSE) {
    CI.lower=beta.np-1.96*se.beta.np; CI.upper=beta.np+1.96*se.beta.np;
    CI.beta.np=round(cbind(CI.lower,CI.upper),3)
  }
  # If bootstrap not requested, return p-values based on normality assumption
  if(pvalue.boot==FALSE)  {
    Test.statistic=(beta.np/se.beta.np)^2;
    p.value=round(2*(1-pnorm(abs(beta.np/se.beta.np))),4)
  }

  # computing 95% confidence intervals
  if(CI.boot==TRUE)
  {
    CI.beta.np=matrix(0,nrow=p,ncol=2)
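    # percentile bootstrap CI: among the quantiles at seq(0,1,0.025), element 2 is the 2.5%
    # quantile and element 40 is the 97.5% quantile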
    for(k in 1:p) CI.beta.np[k,]=round(c(quantile(beta.boot.np[,k],seq(0,1,0.025))[2],quantile(beta.boot.np[,k],seq(0,1,0.025))[40]),3)
  }



  if(pvalue.boot==TRUE)
  {
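    # nested (double) bootstrap for p-values: the outer B1 resamples provide bootstrap estimates
    # of beta, and for each of them an inner bootstrap of size B2 estimates its standard error,
    # giving a studentized test statistic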
    B1=B.pvalue.boot;
    B2=B.pvalue.se.boot;
    beta.boot.np1=matrix(0,nrow=B1,ncol=p); beta.boot.np2=matrix(0,nrow=B2,ncol=p); beta.boot.np.sd1=matrix(0,nrow=B1,ncol=p)
    for(b1 in 1:B1) {
      repeat{ # creating repeat loop in case Shen algorithm fails
        if(time.var==FALSE) {temp.sample1=sort(sample(n,replace=TRUE))};
        if(time.var==TRUE) {
          temp.obs1=sort(sample(n.subject,replace=TRUE))
          temp.sample1=unlist(sapply(temp.obs1,subjects,FUN=function(x,y) which(x==y)))
        }
        Y.temp1=Y[temp.sample1]; L.temp1=L[temp.sample1]; R.temp1=R[temp.sample1];
        if(p==1) X.temp1=X[temp.sample1];
        if(p>=2) X.temp1=X[temp.sample1,];

        out.temp1=cdfDT(Y.temp1,L.temp1,R.temp1,error,n.iter,display=FALSE)


        if(out.temp1$max.iter_reached==0) {break}
      } # ending repeat loop

      P.obs.NP.temp1=out.temp1$P.K
      weights.np.temp1=1/P.obs.NP.temp1
      ### truncating weights ###
      weights.np.temp1[which(weights.np.temp1>trunc.weight)]=trunc.weight
      ### # # # # # # # # #  ###

      # updating data set to include bootstrapped observations
      data.temp1=data.frame(data[temp.sample1,],weights.np.temp1)
      # computing estimates of nonparametric weighted estimator
      beta.boot.np1[b1,]=coxph(formula,data=data.temp1,weights=weights.np.temp1)$coefficients



      # The loop below is to compute the standard error of each bootstrap estimate
      for(b2 in 1:B2) {
        repeat{ # creating repeat loop in case Shen algorithm fails
          if(time.var==FALSE) {temp.sample2=sort(sample(temp.sample1,replace=TRUE))};
          if(time.var==TRUE) {
            temp.obs2=sort(sample(temp.obs1,replace=TRUE))
            temp.sample2=unlist(sapply(temp.obs2,subjects,FUN=function(x,y) which(x==y)))
          }
          Y.temp2=Y[temp.sample2]; L.temp2=L[temp.sample2]; R.temp2=R[temp.sample2];
          if(p==1) X.temp2=X[temp.sample2];
          if(p>=2) X.temp2=X[temp.sample2,];

          out.temp2=cdfDT(Y.temp2,L.temp2,R.temp2,error,n.iter,display=FALSE)

          if(out.temp2$max.iter_reached==0) {break}
        } # ending repeat loop

        P.obs.NP.temp2=out.temp2$P.K
        weights.np.temp2=1/P.obs.NP.temp2
        ### truncating weights ###
        weights.np.temp2[which(weights.np.temp2>trunc.weight)]=trunc.weight
        ### # # # # # # # # #  ###


        # updating data set to include bootstrapped observations
        data.temp2=data.frame(data[temp.sample2,],weights.np.temp2)
        # computing estimates of nonparametric weighted estimator
        beta.boot.np2[b2,]=coxph(formula,data=data.temp2,weights=weights.np.temp2)$coefficients
      }
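      # standard error of the b1-th (outer) bootstrap estimate, computed from the inner bootstrap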
      beta.boot.np.sd1[b1,]=apply(beta.boot.np2,2,"sd")
    }



    # computing test statistics, bootstrapped test statistics, and p-values
    test_statistic.beta.np=numeric(p)
    test_statistic.beta.np.boot=matrix(0,nrow=B1,ncol=p)
    p.value=numeric(p)
    for(k in 1:p) {
      test_statistic.beta.np[k]=(beta.np[k]/se.beta.np[k])^2
      test_statistic.beta.np.boot[,k]=((beta.boot.np1[,k]-beta.np[k])/beta.boot.np.sd1[,k])^2
      p.value[k]=round(length(which(test_statistic.beta.np.boot[,k]>test_statistic.beta.np[k]))/B1,4)
    }
    Test.statistic=test_statistic.beta.np
  }
  # rounding output
  beta.np=round(beta.np,4);
  se.beta.np=round(se.beta.np,4);
  Test.statistic=round(Test.statistic,2)
  p.value[which(p.value==0)]="<.0005"


  # assemble the coefficient table: estimate, SE, 95% CI limits, Wald statistic, and p-value
  results.beta=noquote(cbind(beta.np,se.beta.np,CI.beta.np,Test.statistic,p.value))
  rownames(results.beta)=colnames(X); colnames(results.beta)=c("Estimate","SE","CI.lower","CI.upper","Wald statistic","p-value")

  weights="print option not requested";
  if(print.weights==TRUE) weights=weights.np;

  cat("number of observations read:", nrows.data, "\n")
  cat("number of observations used:", nrows.data.omit, "\n\n")
  if((CI.boot==TRUE)&(pvalue.boot==FALSE)) return(list(results.beta=results.beta,CI="Bootstrap",p.value="Normal approximation",weights=weights));
  if((CI.boot==FALSE)&(pvalue.boot==TRUE)) return(list(results.beta=results.beta,CI="Normal approximation",p.value="Bootstrap",weights=weights));
  if((CI.boot==TRUE)&(pvalue.boot==TRUE))  return(list(results.beta=results.beta,CI="Bootstrap",p.value="Bootstrap",weights=weights));
  if((CI.boot==FALSE)&(pvalue.boot==FALSE))  return(list(results.beta=results.beta,CI="Normal approximation",p.value="Normal approximation",weights=weights));


}
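
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the package): the weighting idea described in
# the @details section above, spelled out by hand for the AIDS example. Selection
# probabilities are estimated with cdfDT(), and their truncated inverses are used
# as weights in coxph(). Note that the naive standard errors printed by coxph()
# ignore the estimation of the weights; coxDT() replaces them with bootstrap
# standard errors.
if (FALSE) { # guarded so nothing runs when the file is sourced or the package is built
  library(survival)
  # nonparametric selection probability P(subject observed) for each subject
  p.obs <- cdfDT(AIDS$Induction.time, AIDS$L.time, AIDS$R.time,
                 10^-6, 1000, display = FALSE)$P.K
  # inverse-probability weights, truncated at 100 (the default trunc.weight)
  w <- pmin(1 / p.obs, 100)
  coxph(Surv(Induction.time, status) ~ Adult, data = AIDS, weights = w)
}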