R/kernel.est.R

#' Estimate the probability in state and restricted mean time in an illness-death
#' model with component-wise censoring
#'
#' This function implements the non-parametric kernel estimator of the probability
#' in state and restricted mean time in state in an
#' illness-death model with component-wise censoring, based on the method of
#' Sun, Huang and Wang (2017).
#'
#' @param dat a dataframe with one row per individual containing the variables
#' \code{t1-tm} and \code{x1-xm}, where \code{m}
#' is the number of visits and \code{ti} and \code{xi} are the time and status at
#' each visit; the variables \code{dtime} and \code{dstatus}, the time
#' and event indicator for death; and the variable \code{nvisits}, the number of visits.
#' \code{xi=1} represents alive and event-free
#' (state 1 in a multistate illness-death model), and \code{xi=2} represents alive with the event (state 2).
#' @param bandwidth specifies the bandwidth to be used for the kernel estimator.
#' This can be selected data-adaptively using the \code{dab()} function.
#' @param tau2 the maximum time that individuals were at risk for a visit,
#' based on the study design.
#' @param prob.times a vector of times, in ascending order, at which the probability
#' in state will be estimated
#' @param mu.times a vector of restriction times, in ascending order, at which the
#' restricted mean time in state will be estimated
#' @param boundary specifies how kernel estimation is done in the left boundary region
#' from zero to the bandwidth. The default is \code{boundary.kernel}, meaning a boundary
#'  kernel is used in the left boundary region. Set \code{boundary = 'interpolation'}
#'  to use linear interpolation through the points (0,1) and (h, r) where h is the
#'  bandwidth and r is the kernel estimate at time h.
#' @param kfun specifies the kernel function to be used for estimation. The default
#' is \code{epanechnikov}; other possible values are \code{triweight}, \code{biweight} and \code{uniform}.
#' @param std.err If \code{std.err = 'asymptotic'} or \code{'boot'}, the function
#' calculates standard error estimates and 95\% confidence intervals for each
#' quantity using the asymptotic or bootstrap estimator, respectively. (\code{std.err = 'none'} is the default.)
#' @param B the number of bootstrap samples; the default is 50 for the sake
#' of computation time, but we recommend increasing it
#' @param boot.seed If \code{boot.seed} is specified, \code{set.seed(boot.seed)}
#'  will be run before generating bootstrap samples, so the samples can be reproduced.
#' @param scale a scaling factor for the restricted mean time in state output. For example,
#' if times are in days and you want the output to reflect restricted mean years in state,
#' set \code{scale = 365.25}.
#' @param weights a vector of weights with length equal to the number of rows in \code{dat}.
#' The default is to assign each observation a weight of one.
#'
#' @return A list with up to two elements: an element called \code{prob.info}  if \code{prob.times} was non-null,
#' and an element called \code{mu.info} if \code{mu.times} was non-null. \code{prob.info} contains
#'  probability in state estimates and \code{mu.info} contains restricted mean time
#'  in state estimates.
#'  The columns in \code{prob.info} are \code{time, p1, p2, p3}: the time and the
#'  probability in states 1, 2 and 3, respectively. If \code{std.err = 'boot'} or
#'  \code{'asymptotic'}, additional columns are added with standard error estimates
#'  and lower and upper limits of the 95\% confidence interval for each estimate.
#'  The columns in \code{mu.info} are analogous (\code{time, mu1, mu2, mu3}, ...).
#' @export
#'
#' @examples mydat <- simdat(50, scale12=1/.0008, scale13=1/.0002, scale23=1/.0016,
#' vital.lfu=c(30.4*36, 30.4*48),
#' visit.schedule = 30.4*c(6, 12, 18, 24, 30, 36, 42, 48), scatter.sd=10)
#' kernel.est(mydat, bandwidth=12*30.4, tau2=30.4*48, prob.times=30.4*48, mu.times=30.4*48,
#' boundary = 'boundary.kernel', kfun='epanechnikov',
#' std.err='none',  scale=12*30.4)
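#'
#' # A second, purely illustrative call (not from the original examples): bootstrap
#' # standard errors with a reproducible seed. B is kept small here only to limit
#' # run time; we recommend a larger value in practice.
#' \donttest{
#' kernel.est(mydat, bandwidth=12*30.4, tau2=30.4*48, prob.times=30.4*c(24, 48),
#' mu.times=30.4*48, std.err='boot', B=50, boot.seed=123, scale=12*30.4)
#' }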
kernel.est <- function(dat, bandwidth, tau2, prob.times=NULL, mu.times=NULL,
                       boundary = 'boundary.kernel', kfun='epanechnikov',
                       std.err='none', B=50, boot.seed = NULL,
                       scale=1, weights=NULL){

  if (!is.null(prob.times) && !isTRUE(all.equal(sort(prob.times), prob.times))) stop('prob.times must be in ascending order.')
  if (!is.null(mu.times) && !isTRUE(all.equal(sort(mu.times), mu.times))) stop('mu.times must be in ascending order.')
  if (!(boundary %in% c('boundary.kernel','interpolation'))) stop('The only allowable entries for boundary are "boundary.kernel" and "interpolation".')
  if (bandwidth > tau2/2) stop('The bandwidth must be less than or equal to tau2/2.')
  if (!is.null(weights) && length(weights) != nrow(dat)) stop('The number of weights must be equal to the number of rows in dat.')

  if (is.null(weights)) weights<-rep(1, nrow(dat))

  if (kfun == 'epanechnikov') {
    Kq <<- function(x, q) {
      sigk1 <- sqrt(0.2)
      2 / (q + 1) * K1(2 / (q + 1) * (x - (q - 1) / 2)) *
        (1 + ((q - 1) / (q + 1) / sigk1) ^ 2 + 2 / sigk1 ^ 2 * (1 - q) / (1 + q) ^ 2 * x)
      # Could also use  6*(1+x)*(q-x)*((1+q)^(-3))*(1+5*((1-q)/(1+q))^2+10*(1-q)*((1+q)^(-2))*x)
      # Equivalent when x<=q which is the only time we'd use it!
    }
    K1 <<- function(u) {
      0.75 * (1 - u ^ 2) * (abs(u) < 1)
    }
  }
  else if (kfun == 'uniform') {
    Kq <<- function(x, q) {
      (1+q)^(-1)*(1+3*((1-q)/(1+q))^2+6*(1-q)*(1+q)^(-2)*x)*(abs(x)<1)
    }
    K1 <<- function(u) 0.5 * (abs(u) < 1)
  }
  else if (kfun == 'triweight'){
    Kq <<- function(x, q) {
      140*(1+x)^3*(q-x)^3*(1+q)^(-7)*(1+9*((1-q)/(1+q))^2+18*(1-q)*((1+q)^(-2))*x)*(abs(x)<1)
    }
    K1 <<- function(u) (35/32)*(1-u^2)^3 * (abs(u) < 1)
  }
  else if (kfun == 'biweight'){
    Kq <<- function(x, q) {
      30*(1+x)^2*(q-x)^2*(1+q)^(-5)*(1+7*((1-q)/(1+q))^2+14*(1-q)*((1+q)^(-2))*x)*(abs(x)<1)
    }
    K1 <<- function(u) (15/16)*(1-u^2)^2 * (abs(u) < 1)
  }
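  # K1 is the interior kernel; Kq(x, q) is the corresponding boundary-corrected
  # kernel, with q the distance to the boundary in bandwidth units. Both are
  # assigned with <<- (global assignment), presumably so that the helper
  # estimating functions (e.g. f2() and int_f2()) can find them when called.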

  # Prep data
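  # Reshape the visit data to long format: V.all and Y.all stack the visit times
  # and event-free indicators (xi == 1) across subjects, one element per visit,
  # with NA (unobserved) visits dropped; ID.all and Weights.all carry the matching
  # subject ID and weight for each visit.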
  nvisits<-dat$nvisits[1]
  ID<- 1:nrow(dat)
  X <- dat$dtime
  E <- dat$dstatus
  V.all <- c(t(as.matrix(dat[,paste('t',1:nvisits, sep='')])))
  Y.all <- as.numeric(c(t(as.matrix(dat[,paste('x',1:nvisits, sep='')])))==1)
  ID.all <- rep(1:nrow(dat), each = nvisits)
  Weights.all <- rep(weights, each = nvisits)
  missing <- is.na(V.all)
  V.all <- V.all[!missing]
  Y.all <- Y.all[!missing]
  ID.all <- ID.all[!missing]
  Weights.all <- Weights.all[!missing]
  N<-length(X)

  #####################################################
  # generate survival data
  #####################################################
  fit <- survival::survfit(survival::Surv(X, E)~1)
  Left <- c(0,summary(fit)$time)
  Right <- c(summary(fit)$time,Inf)
  Value <- c(1,summary(fit)$surv)
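  # (Left, Right, Value) encode the Kaplan-Meier estimate of survival from death as
  # a step function: S(t) = Value[k] for t in [Left[k], Right[k]); the helper Sd()
  # is used below to look up S(t) from this representation.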

  #####################################################
  # calculate values at the times of interest
  #####################################################
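  # prob.matrix columns: P1(t) from the kernel estimator f2(), P3(t) = 1 - S(t)
  # from the Kaplan-Meier fit, and P2(t) = 1 - P1(t) - P3(t).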
  if (length(prob.times)>0){
    prob.matrix<-matrix(NA, nrow = length(prob.times), ncol = 3)
    prob.matrix[,1]<-f2(prob.times, v.all = V.all, y.all = Y.all, weights.all = Weights.all,
                        h = bandwidth, N = N, tau2 = tau2, boundary = boundary,
                        Left=Left, Right= Right, Value=Value)
    prob.matrix[,3]<-1-sapply(1:length(prob.times), function (x)
      Sd(prob.times[x], Value, Left, Right))
    prob.matrix[,2]<-1-prob.matrix[,1]-prob.matrix[,3]
  }

  if (length(mu.times)>0){
    rm.matrix<-matrix(NA, nrow = length(mu.times), ncol = 3)
    mu.times2<-c(0, mu.times)
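    # int_f2() integrates the kernel estimate of P1(u) over each interval
    # (mu.times2[x], mu.times2[x+1]]; cumulative sums give the restricted mean time
    # in state 1 up to each restriction time. mu3 is the restricted mean time in
    # state 3 (time lost to death) from the Kaplan-Meier fit, and mu2 is the remainder.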
    rm.diff<-sapply(1:length(mu.times), function(x) int_f2(mu.times2[x], mu.times2[x+1],bandwidth, V.all,Y.all,Weights.all,
                                                           N,tau2, boundary, Left, Right, Value))
    rm.matrix[,1]<-cumsum(rm.diff)/scale
    rm.matrix[,3]<-mu.times/scale-sapply(1:length(mu.times), function(i){
      if (mu.times[i]<min(X[E==1])) mu.times[i]/scale
      else summary(fit, rmean = mu.times[i], scale =scale, extend = T)$table[5]
    })
    rm.matrix[,2]<-mu.times/scale-rm.matrix[,1]-rm.matrix[,3]
  }

  if (std.err=='none')  {
    if (length(prob.times)>0){
      prob.matrix<-cbind(prob.times/scale, prob.matrix)
      colnames(prob.matrix)<-c('time','p1','p2','p3')
    }
    if (length(mu.times)>0){
      rm.matrix<-cbind(mu.times/scale, rm.matrix)
      colnames(rm.matrix)<-c('time','mu1','mu2','mu3')
    }
  }
  else if (std.err == 'asymptotic'){
    #####################################################
    # generate longitudinal data
    #####################################################
    xi.all <- xi(V.all,bandwidth, V.all, Y.all, Weights.all, N, tau2 = tau2)
    xi.all2 <- xi(V.all,bandwidth, V.all, 1-Y.all, Weights.all, N, tau2 = tau2)
    lambda.all <- lambda(V.all,bandwidth, V.all, Weights.all, N, tau2 = tau2)
    sd.all <- xi.all
    for(i in 1:length(sd.all))
    {
      sd.all[i] <- Sd(V.all[i], Value, Left, Right)
    }
    #####################################################
    # calculate mu, S_X at terminal time
    #####################################################
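    # sx2.all[i] is the empirical proportion of subjects still under observation at
    # X[i]; mu2.all[i] and mu2.2.all[i] accumulate S(v)*I(state)/lambda(v) over the
    # visits before X[i], giving plug-in estimates of the mean time spent event-free
    # and with the event up to X[i].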
    sx2.all <- rep(0,N)
    for(i in 1:N)
    {
      sx2.all[i] <- sum(X>=X[i])/N
    }
    mu2.all <- rep(0,N)
    mu2.2.all <- rep(0,N)
    for(i in 1:N)
    {
      tt <- X[i]
      mu2.all[i] <- sum((Weights.all*sd.all*Y.all/lambda.all)[V.all<tt])/N
      mu2.2.all[i] <- sum((Weights.all*sd.all*(1-Y.all)/lambda.all)[V.all<tt])/N
    }
    mu2.3.all<-mu.times-sapply(1:length(X), function(i)
      summary(fit, rmean =X[i], extend = T)$table[5])
    # The mu2 quantities are NOT scaled here; division by `scale` happens below when the Phi terms are combined.

    #####################################################
    # calculate Phi_i
    #####################################################
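    # The Phi matrices hold per-subject contributions (influence-function style) for
    # each restriction time; their column-wise standard deviations, divided by
    # sqrt(N), give the standard error estimates used below. phi3 plays the same
    # role for the probability in state 3.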
    Phi1.1<-Phi2.1<-Phi3.1<-matrix(NA, nrow = N, ncol = length(mu.times))
    Phi1.2<-Phi2.2<-Phi3.2<-matrix(NA, nrow = N, ncol = length(mu.times))
    Phi3<-matrix(NA, nrow = N, ncol = length(mu.times))
    phi3<-matrix(NA, nrow = N, ncol = length(prob.times))
    for(i in 1:N)
    {
      V.i <- V.all[ID.all == i]
      Y.i <- Y.all[ID.all == i]
      sd.i <- sd.all[ID.all == i]
      lambda.i <- lambda.all[ID.all == i]
      xi.i <- xi.all[ID.all == i]
      xi.i2<-xi.all2[ID.all == i]
      weight.i<-weights[ID == i]

      if (length(mu.times)>0){
        for (j in 1:length(mu.times)){
          # Variance of RMTIS1
          Phi1.1[i,j] <- (sum(1/(sx2.all[X<X[i] & X<mu.times[j] & E == TRUE])^2)/N - (1/sx2.all[i])*(X[i]<mu.times[j] & E[i] == 1))*rm.matrix[j,1]*scale +
            (mu2.all[i]/sx2.all[i])*(X[i]<mu.times[j] & E[i] == 1) - sum((mu2.all/(sx2.all)^2)[X<X[i] & X<mu.times[j] & E == TRUE])/N
          Phi2.1[i,j] <- weight.i*sum((sd.i*Y.i/lambda.i)[V.i<mu.times[j]]) #- out
          Phi3.1[i,j] <- weight.i*sum((sd.i*xi.i/lambda.i^2)[V.i<mu.times[j]]) #- out

          # Variance of RMTIS2
          Phi1.2[i,j] <- (sum(1/(sx2.all[X<X[i] & X<mu.times[j] & E == TRUE])^2)/N - (1/sx2.all[i])*(X[i]<mu.times[j] & E[i] == 1))*rm.matrix[j,2]*scale +
            (mu2.2.all[i]/sx2.all[i])*(X[i]<mu.times[j] & E[i] == 1) - sum((mu2.2.all/(sx2.all)^2)[X<X[i] & X<mu.times[j] & E == TRUE])/N
          Phi2.2[i,j] <- weight.i*sum((sd.i*(1-Y.i)/lambda.i)[V.i<mu.times[j]]) #- out
          Phi3.2[i,j] <- weight.i*sum((sd.i*(xi.i2)/lambda.i^2)[V.i<mu.times[j]]) #- out

          #Variance of RMTIS3
          Phi3[i,j] <- (sum(1/(sx2.all[X<X[i] & X<mu.times[j] & E == TRUE])^2)/N - (1/sx2.all[i])*(X[i]<mu.times[j] & E[i] == 1))*rm.matrix[j,3]*scale +
            (mu2.3.all[i]/sx2.all[i])*(X[i]<mu.times[j] & E[i] == 1) - sum((mu2.3.all/(sx2.all)^2)[X<X[i] & X<mu.times[j] & E == TRUE])/N
        }
      }

      if (length(prob.times)>0){
        # variance of sqrt(n)*(probability in state 3)
        for (j in 1:length(prob.times)){
          phi3[i,j]<- -Sd(prob.times[j], Value, Left, Right)*(sum(1/(sx2.all[X<X[i] & X<prob.times[j] & E == TRUE])^2)/N - (1/sx2.all[i])*(X[i]<prob.times[j] & E[i] == 1))
        }
      }
    }

    Phi <-  (Phi1.1 + Phi2.1 - Phi3.1)/scale
    Phi.2 <-  (Phi1.2 + Phi2.2 - Phi3.2)/scale
    Phi3 <- Phi3/scale

    if (length(prob.times)>0){
      # variance of sqrt(n)*(prob in state 1). Same as state 2.
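      # K22 is the integral of the squared kernel, int K(u)^2 du, for the chosen kernel.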
      K22 <- 3/5*(kfun == 'epanechnikov') + 1/2*(kfun == 'uniform') + 350/429*(kfun == 'triweight') + 5/7*(kfun == 'biweight')
      var1 <- (K22*((1-prob.matrix[,3])*prob.matrix[,1]-prob.matrix[,1]^2)/lambda(prob.times,bandwidth, V.all, Weights.all, N, tau2 = tau2))/bandwidth
      var1[prob.matrix[,1]==0]<-0
      prob.matrix<-cbind(prob.matrix, sqrt(var1)/sqrt(N), sqrt(var1)/sqrt(N), apply(phi3, 2, sd)/sqrt(N))
      if (length(prob.times)>1) prob.matrix<-cbind(prob.times/scale, prob.matrix, prob.matrix[,1:3]-1.96*prob.matrix[,4:6], prob.matrix[,1:3]+1.96*prob.matrix[,4:6])
      else prob.matrix <- matrix(c(prob.times/scale, prob.matrix, prob.matrix[,1:3]-1.96*prob.matrix[,4:6], prob.matrix[,1:3]+1.96*prob.matrix[,4:6]), nrow = 1)
      colnames(prob.matrix)<-c('time','p1','p2','p3','p1se', 'p2se','p3se',
                               'p1lower','p2lower','p3lower','p1upper','p2upper','p3upper')
    }
    if (length(mu.times)>0){
      rm.matrix<-cbind(rm.matrix, apply(Phi, 2, sd)/sqrt(N), apply(Phi.2, 2, sd)/sqrt(N), apply(Phi3, 2, sd)/sqrt(N))
      if (length(mu.times)>1) rm.matrix <- cbind(mu.times/scale, rm.matrix, rm.matrix[,1:3]-1.96*rm.matrix[,4:6], rm.matrix[,1:3]+1.96*rm.matrix[,4:6])
      else rm.matrix <- matrix(c(mu.times/scale, rm.matrix, rm.matrix[,1:3]-1.96*rm.matrix[,4:6], rm.matrix[,1:3]+1.96*rm.matrix[,4:6]), nrow = 1)
      colnames(rm.matrix)<-c('time','mu1','mu2','mu3','mu1se', 'mu2se','mu3se',
                             'mu1lower','mu2lower','mu3lower','mu1upper','mu2upper','mu3upper')
    }
  }

  else if (std.err == 'boot'){
    if (length(prob.times)>0) prob.boot<-matrix(NA, nrow = B, ncol = 3*length(prob.times))
    if (length(mu.times)>0) rm.boot<-matrix(NA, nrow = B, ncol = 3*length(mu.times))
    if(!is.null(boot.seed)) set.seed(boot.seed)
    indices<-matrix(sample(nrow(dat), nrow(dat)*B, replace = T), nrow = B)
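    # Each row of `indices` is one bootstrap resample of subject row numbers; all
    # resamples are drawn up front so that a single boot.seed reproduces the full set.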

    for (i in 1:B) {
      dat.bs <- dat[indices[i,],]
      # Prep data
      nvisits<-dat.bs$nvisits[1]
      ID<- 1:nrow(dat.bs)
      X <- dat.bs$dtime
      E <- dat.bs$dstatus
      V.all <- c(t(as.matrix(dat.bs[,paste('t',1:nvisits, sep='')])))
      Y.all <- as.numeric(c(t(as.matrix(dat.bs[,paste('x',1:nvisits, sep='')])))==1)
      ID.all <- rep(1:nrow(dat.bs), each = nvisits)
      Weights.all <- rep(weights[indices[i,]], each = nvisits)  # align weights with the resampled rows
      missing <- is.na(V.all)
      V.all <- V.all[!missing]
      Y.all <- Y.all[!missing]
      ID.all <- ID.all[!missing]
      Weights.all <- Weights.all[!missing]
      N<-length(X)

      #####################################################
      # generate survival data
      #####################################################
      fit <- survival::survfit(survival::Surv(X, E)~1)
      Left <- c(0,summary(fit)$time)
      Right <- c(summary(fit)$time,Inf)
      Value <- c(1,summary(fit)$surv)

      #####################################################
      # calculate values at the times of interest
      #####################################################
      if (length(prob.times)>0){
        prob.matrix.boot<-matrix(NA, nrow = length(prob.times), ncol = 3)
        prob.matrix.boot[,1]<-f2(prob.times, v.all = V.all, y.all = Y.all, weights.all = Weights.all, h = bandwidth, N = N, tau2 = tau2, boundary = boundary,
                            Left=Left, Right= Right, Value=Value)
        prob.matrix.boot[,3]<-1-sapply(1:length(prob.times), function (x)
          Sd(prob.times[x], Value, Left, Right))
        prob.matrix.boot[,2]<-1-prob.matrix.boot[,1]-prob.matrix.boot[,3]
        prob.boot[i,]<-t(prob.matrix.boot)
      }
      if (length(mu.times)>0){
        rm.matrix.boot<-matrix(NA, nrow = length(mu.times), ncol = 3)
        mu.times2<-c(0, mu.times)
        rm.diff<-sapply(1:length(mu.times), function(x) int_f2(mu.times2[x], mu.times2[x+1],bandwidth, V.all,Y.all,Weights.all,N,tau2, boundary, Left, Right, Value, warn = F))
        rm.matrix.boot[,1]<-cumsum(rm.diff)/scale
        rm.matrix.boot[,3]<-mu.times/scale-sapply(1:length(mu.times), function(i){
          if (mu.times[i]<min(X[E==1])) mu.times[i]/scale
          else summary(fit, rmean = mu.times[i], scale =scale, extend = T)$table[5]
        })
        rm.matrix.boot[,2]<-mu.times/scale-rm.matrix.boot[,1]-rm.matrix.boot[,3]
        rm.boot[i,]<-t(rm.matrix.boot)
      }
    }
    if (length(prob.times)>0) {
      prob.se<-cbind(matrix(apply(prob.boot,2,sd), ncol = 3, byrow = T),
                matrix(apply(prob.boot,2,function(x) quantile(x,probs = 0.025,na.rm = T)), ncol=3, byrow = T),
                matrix(apply(prob.boot,2,function(x) quantile(x,probs = 0.975,na.rm = T)), ncol=3, byrow = T))
      prob.matrix<-cbind(prob.times/scale, prob.matrix, prob.se)
      colnames(prob.matrix)<-c('time','p1','p2','p3','p1se', 'p2se','p3se',
                               'p1lower','p2lower','p3lower','p1upper','p2upper','p3upper')
    }
    if (length(mu.times)>0) {
      rm.se<-cbind(matrix(apply(rm.boot,2,sd), ncol = 3, byrow = T),
                matrix(apply(rm.boot,2,function(x) quantile(x,probs = 0.025,na.rm = T)), ncol=3, byrow = T),
                matrix(apply(rm.boot,2,function(x) quantile(x,probs = 0.975,na.rm = T)), ncol=3, byrow = T))
      rm.matrix<-cbind(mu.times/scale, rm.matrix, rm.se)
      colnames(rm.matrix)<-c('time','mu1','mu2','mu3','mu1se', 'mu2se','mu3se',
                           'mu1lower','mu2lower','mu3lower','mu1upper','mu2upper','mu3upper')
    }
  }
  if (!is.null(prob.times)&!is.null(mu.times)) return(list(prob.info = data.frame(prob.matrix), mu.info =data.frame(rm.matrix)))
  else if(!is.null(prob.times)) return(list(prob.info = data.frame(prob.matrix)))
  else if(!is.null(mu.times)) return(list(mu.info = data.frame(rm.matrix)))
}