R/NPcdf.R

Defines functions icfitCalc Aintmap NPcdf

NPcdf <- function(start, end, count){
  # This function is used to build a cdf for one or more experiments
  # with time-to-event data.
  # Three strategies are possible:
  # 1 - build an NPMLE estimator (pkg. interval);
  # 2 - Use a naive estimator with midpoint imputation
  # 3 - Use a naive estimator with endpoint imputation
  # 4 - Use a naive estimator with startpoint imputation
  # Service functions are taken and modified from the 'interval' package
  # Michael P. Fay, Pamela A. Shaw (2010)., Journal of
  # Statistical Software, 36(2), 1-34."
  # Modified on 27/05/2021

  n <- sum(count)

  # CDF (NPMLE)
  tb <- rep(start, count)
  ta <- rep(end, count)
  obj <- getNPMLE(survival::Surv(tb, ta, type = "interval2") ~ 1)

  begInt <- obj$intmap[1,]
  endInt <- obj$intmap[2,]
  objpf <- obj$pf
  if(!is.finite(endInt[1]) & length(endInt) == 1){
    endInt <- c(begInt, endInt)
    begInt <- c(0, begInt)
    objpf <- c(0, objpf)
  }

  cumCount <- objpf*n
  cumProp <- objpf
  cumProp2 <- cumsum(objpf)
  cdfNPMLE <- data.frame(startTime = begInt, endTime = endInt,
                         count = cumCount, pdf = cumProp, cdf = cumProp2)
  cdfNPMLEsum <- cdfNPMLE
  cdfNPMLE <- cdfNPMLE[is.finite(cdfNPMLE$endTime) == T,]


  # CDF (NPMLE) - plotting data
  time <- c(0, as.vector(obj$intmap))
  crit <- c(TRUE, as.vector(attr(obj$intmap, "LRin")))
  S <- c("[0,0]" = 0, cumsum(obj$pf))
  S <- rep(S, each=2)[-2*length(S)]
  sel <- which(duplicated(time))
  sel <- sel[!crit[sel]]
  time[time==Inf] <- max(time)
  t2 <- time[-sel]
  S2 <- S[-sel]

  # Correction 20/06/21. To account for uncensored obs
  # sel <- which(duplicated(time))
  # if(length(sel) == 0){
  #   t2 <- time
  #   S2 <- S
  # } else {
  #   t2 <- time[-sel]
  #   S2 <- S[-sel]
  # }

  Sb2 <- c(0, diff(S2))
  if(length(t2) == 0) {
    t2 <- cdfNPMLE$endTime; S2 <- 0# ; Sb2 <- 0}
  }

  cdfNPMLEg <- data.frame(time = t2, pdf = Sb2, cdf = S2)
  cdfNPMLEg <- cdfNPMLEg[is.finite(cdfNPMLEg$time) == T,]

  # CDF (naive midpoint imputation)
  start <- start[is.finite(end)]
  count <- count[is.finite(end)]
  end <- end[is.finite(end)]
  centre <- (start + end)/2
  tmp <- tapply(count, centre, sum)
  cumCentre <- as.numeric(names(tmp))
  cumCount <- as.numeric(tmp)
  cumProp <- cumCount/n
  cumProp2 <- cumsum(cumProp)
  cdfMI <- data.frame(time = cumCentre, count = cumCount,
              pdf = cumProp, cdf = cumProp2)

  # CDF (naive endpoint imputation)
  tmp <- tapply(count, start, sum)
  tmp2 <- tapply(count, end, sum)
  cumStart <- as.numeric(names(tmp))
  cumEnd <- as.numeric(names(tmp2))
  cumCount <- as.numeric(tmp2)
  cumStart <- cumStart[is.finite(cumEnd)==T]
  cumCount <- cumCount[is.finite(cumEnd)==T]
  cumEnd <- cumEnd[is.finite(cumEnd)==T]
  cumProp <- cumCount/n
  cumProp2 <- cumsum(cumProp)
  cdfEI <- data.frame(time = cumEnd, count = cumCount,
              pdf = cumProp, cdf = cumProp2)

  # CDF (naive startpoint imputation)
  tmp <- tapply(count, start, sum)
  tmp2 <- tapply(count, end, sum)
  cumStart <- as.numeric(names(tmp))
  cumEnd <- as.numeric(names(tmp2))
  cumCount <- as.numeric(tmp2)
  cumStart <- cumStart[is.finite(cumEnd)==T]
  cumCount <- cumCount[is.finite(cumEnd)==T]
  cumEnd <- cumEnd[is.finite(cumEnd)==T]
  cumProp <- cumCount/n
  cumProp2 <- cumsum(cumProp)
  cdfSI <-data.frame(time = cumStart, count = cumCount,
              pdf = cumProp, cdf = cumProp2)
  # if(class(cdfSI) == "try-error") cdfSI <- NA

  # Fh <- Vectorize(function(x) {
  #   # Returns a function for prediction from NPMLE
  #   if(x < 0){
  #     return(NA)
  #   } else {
  #     time <- cdfNPMLEg$time
  #     prop <- cdfNPMLEg$cdf
  #     LT <- max(time[time <= x])
  #     if(LT == max(time)){
  #       return(max(prop))
  #     } else {
  #       posLT <- which(time == LT)
  #       UT <- min(time[time > x])
  #       posUT <- which(time == UT)
  #       pLT <- prop[posLT]
  #       pUT <- prop[posUT]
  #       df <- data.frame(r = c(LT, UT), u = c(pLT, pUT))
  #       val <- predict(lm(u ~ r, data = df), newdata = data.frame(r = x))
  #       return(c(val))
  #     }
  #     }
  # })
  Fh <- function(x, method = "interpolation") {
    # Returns a function for prediction from NPMLE
    val <- 1 - predictCDF(obj, x, method)$S
    return(c(val))
  }

  return(list(Type1 = cdfNPMLE, Type1plot = cdfNPMLEg,
              Type2 = cdfMI, Type3 = cdfEI, Type4 = cdfSI, SurvObj = obj,
              SurvObjSum = cdfNPMLEsum, Fh = Fh, n = n))
}

# Service functions. They are largely taken from the 'interval package' #####
getNPMLE <- function (formula, data,...) {
    ## Most of this function is copied or slightly modified from survfit
    ## Copied starting from here:

    call <- match.call()
    m <- match.call(expand.dots = FALSE)
    m$... <- NULL
    Terms <- terms(formula, "strata")
    ord <- attr(Terms, "order")
        if (length(ord) & any(ord != 1))
            stop("Interaction terms are not valid for this function")
    m$formula <- Terms
    m[[1]] <- as.name("model.frame")
    m <- eval(m, parent.frame())
    n <- nrow(m)

    ## left-hand-side of formula is "response"
        Y <- model.extract(m, "response")
        casewt <- model.extract(m, "weights")
        if (is.null(casewt))
            casewt <- rep(1, n)
        if (!is.null(attr(Terms, "offset")))
            warning("Offset term ignored")
        ll <- attr(Terms, "term.labels")
        if (length(ll) == 0)
            X <- factor(rep(1, n))
        else X <- survival::strata(m[ll])

        ### do a separate fit for each level of the factor
        group <- levels(X)
        nstrata <- length(group)
        sbind <- function(x,y){
            if (is.vector(x) & is.vector(y)){
                 if (is(x, "list") & is(y, "list")){
                     out<-list(x,y)
                 } else out<-c(x,y)
            } else if (is.matrix(x) & is.matrix(y)) out<-cbind(x,y)
            if (!is.null(attr(x,"LRin")) & !is.null(attr(y,"LRin"))){
                xLRin<-attr(x,"LRin")
                yLRin<-attr(y,"LRin")
                attr(out,"LRin")<-cbind(xLRin,yLRin)
            }
            return(out)
        }
        ## change original left-hand-side of formula to list with L and R vectors representing left and right endpoints
        L <- R <- Y[,1]
        R[Y[,3]==0]<-Inf
        L[Y[,3]==2]<-0
        R[Y[,3]==3] <- Y[Y[,3]==3,2]
        Lin <-rep(FALSE, length(L))
        Rin <-rep(TRUE,length(L))
        Lin[L==R] <- TRUE
        Rin[R==Inf] <- FALSE
        Y <- data.frame(L=L,R=R,Lin=Lin,Rin=Rin)
        AI <- Aintmap(L, R, Lin, Rin)

        # Y <- SurvLR(Y)
        # Fit NMPLE for each stratum (da correggere)
        # print(nstrata)
        for (i in 1:nstrata){
             tempout <- icfitCalc(Y$L[X==group[i]],Y$R[X==group[i]],Lin=Y$Lin[X==group[i]],Rin=Y$Rin[X==group[i]])
             #tempout<-icfit.default(Y$L[X==group[i]],Y$R[X==group[i]],Lin=Y$Lin[X==group[i]],Rin=Y$Rin[X==group[i]],...)
             tempout$strata <- length(tempout$pf)
             names(tempout$strata) <- group[i]
             if (i==1){ icout <- tempout
             } else{
                 icout$A<-NULL
                 tempout$A<-NULL
                 icout<-mapply(sbind, icout, tempout)
             }
        }
        class(icout) <- c("icfit")
        if (!is.null(attr(m, "na.action")))
            icout$na.action <- attr(m, "na.action")

    icout$call <- call
    icout
}

Aintmap <- function(L, R, Lin=NULL, Rin=NULL){
    # Finds Turnbull's intervals
    n<-length(L)
    if (is.null(Lin) & is.null(Rin)){
      Lin<-rep(FALSE,n)
      Rin<-rep(TRUE,n)
      Lin[L==R]<-TRUE
      Rin[R==Inf]<-FALSE
    } else if (length(Lin)==1 & length(Rin)==1 & is.logical(Lin) & is.logical(Rin) ){
      Lin<-rep(Lin,n)
      Rin<-rep(Rin,n)
    } else if (length(Lin)!=n | length(Rin)!=n | !all(is.logical(Lin)) | !all(is.logical(Rin)) ){
      stop("Lin and Rin should be either NULL, logical length 1 or length same as L,R")
    }
    if(n != length(R))
      stop("length of L and R must be the same")
    # calculate a small number, eps, to differentiate between e.g.,  [L,R] and (L,R]
    # we will treat, (L,R] as [L+eps,R], and [L,R) as [L,R-eps]
    # since eps is only
    # used in ranking we do not need to make it super small
    # just smaller than the smallest difference
    LRvalues <- sort(unique(c(0,L,R,Inf)))
    eps<- min(diff(LRvalues))/2
    Le<-L
    Re<-R
    Le[!Lin]<-L[!Lin]+eps
    Re[!Rin]<-R[!Rin]-eps
    # let s be the vector of ordered L and R values with
    # R values later when there are ties
    # then intmap are values s[i] and s[i+1] where s[i] is
    # associated with L and s[i+1] is associated with R
    oLR<-order(c(Le,Re+eps/2) )
    # find the Turnbull intervals, or innermost intervals
    # this is the same as the primary reduction of
    ### Aragon and Eberly (1992) J of Computational and Graphical
    ###     Statistics 1:129-140
    # label L=1 and R=2
    Leq1.Req2<-c(rep(1,n),rep(2,n))
    # order and see if an R is followed by an L
    # take difference of Leq1.Req2 after putting them in
    # order, then if the difference is 1 then the R=2 is followed by L=1
    flag<- c(0,diff( Leq1.Req2[oLR] ))
    R.right.of.L<- (1:(2*n))[flag==1]
    intmapR<- c(L,R)[oLR][R.right.of.L]
    intmapL<- c(L,R)[oLR][R.right.of.L - 1]
    intmapRin<- c(Lin,Rin)[oLR][R.right.of.L]
    intmapLin<- c(Lin,Rin)[oLR][R.right.of.L - 1]
    intmap<-matrix(c(intmapL,intmapR),byrow=TRUE,nrow=2)
    attr(intmap,"LRin")<-matrix(c(intmapLin,intmapRin),byrow=TRUE,nrow=2)
    k<-dim(intmap)[[2]]
    Lbracket<-rep("(",k)
    Lbracket[intmapLin]<-"["
    Rbracket<-rep(")",k)
    Rbracket[intmapRin]<-"]"
    intname<-paste(Lbracket,intmapL,",",intmapR,Rbracket,sep="")
    A<-matrix(0,n,k,dimnames=list(1:n,intname))
    intmapLe<-intmapL
    intmapLe[!intmapLin]<-intmapL[!intmapLin]+eps
    intmapRe<-intmapR
    intmapRe[!intmapRin]<-intmapR[!intmapRin]-eps
    for (i in 1:n){
      tempint<- Le[i]<=intmapRe & Re[i]>=intmapLe
      A[i,tempint]<-1
    }

    # previous versions (<=0.9-9.1) did primary reduction twice,
    # once as described in Turnbull (see above) and once as
    # described in Aragon and Eberly (1992, J of Computational and Graphical
    #    Statistics 1:129-140)
    # both do same thing, so we do not need to do it twice

    ## fix error when intmap=(0,Inf) and k=1, previously A was column matrix of 0, should be a column matrix of 1
    if (k==1 & intmap[1,1]==0 & intmap[2,1]==Inf) A[A==0]<-1

    out<-list(A=A,intmap=intmap)
    out
}

icfitCalc <- function(L, R, initfit = NULL, Lin=NULL,
                    Rin = NULL, epsilon = 1e-06, maxit = 10000, ...){
  # epsilon <- 1e-06
  # maxit <- control$maxit
  AI <- Aintmap(L,R,Lin,Rin)
  A <- AI$A
  if (any(apply(A,1,sum)==0)) stop("A row all zeros. Appears that there are some R<L")
  n <- dim(A)[[1]]
  k <- dim(A)[[2]]
  intmap<-AI$intmap
  if (k==1){
      pf<-1
      ### fix error 1/24/2011: needed to change name from numit to count in emout list
      emout <- list(error=0,count=0,converge=TRUE,message="normal convergence")
      anypzero<-FALSE
  } else {
      ### come up with the initial estimates from the initfit option
      ### may be (1) NULL, (2) character vector giving name of function,
      ### or (3) icfit or similar object
      ### If of type (1) or (2) we first convert it to type (3)
      # if(is.null(initfit)) {
      pbar <- apply(A/apply(A, 1, sum), 2, mean)
      initfit<-list(pf=pbar,intmap=intmap)
      ## the following else section is for initfit functions
      # } else if (is.character(initfit) & length(initfit)==1){
      #     ## because some initfit functions will input A and some will
      #     ## input L,R,Lin, and Rin, we input all 5 variables
      #     ## but any initfit function need not use all 5
      #     ## Get options for initfit function from control
      #     initfitOpts<-control$initfitOpts
      #
      #     ## since initfit functions may not know how to interpret Lin=NULL and Rin=NULL create the values
      #     if (is.null(Lin) & is.null(Rin)){
      #         Lin<-rep(FALSE,length(L))
      #         Rin<-rep(TRUE,length(L))
      #         Lin[L==R]<-TRUE
      #         Rin[R==Inf]<-FALSE
      #     }
      #     ## use try function in case initfit function fails
      #     if (is.null(initfitOpts)){
      #         initfit<-try( do.call(initfit,args=list(L=L,R=R,Lin=Lin,Rin=Rin,A=A)) )
      #     } else {
      #         initfit<-try( do.call(initfit,args=c(list(L=L,R=R,Lin=Lin,Rin=Rin,A=A),initfitOpts)) )
      #     }
      #     if (class(initfit)=="try-error"){
      #         warning("initfit was a character, treated as a function name, and when called gave an error so will not be used")
      #         pbar <- apply(A/apply(A, 1, sum), 2, mean)
      #     } else {
      #         if (is.null(initfit$pf)) stop("initfit treated as function and did not produce list with pf element")
      #         ## if the initfit function outputs an intmap, check that it matches
      #         ## what we have already calculated
      #         ## do not check attributes of intmap to allow functions that do not output that
      #         initintmap<-initfit$intmap
      #         pbar<-initfit$pf
      #     }
      #     if (is.null(initfit$intmap)){ initfit<-list(pf=pbar,intmap=intmap)
      #     } else initfit<-list(pf=pbar,intmap=initfit$intmap)
      # }

      ### Check the initfit:
      ## if the initfit has a different intmap but it has some elements that match
      ## (as would happen if the initfit function deleted values from the intmap with 0 mass)
      ## the current intmap then we can still try this initfit,
      ## we use pbar proportional to the initfit$pf values of those intmaps values that match
      if (is.null(initfit$pf) | is.null(initfit$intmap)) stop("initfit should be either a character function name or a list with elements pf and intmap")
      nkeep<- dim(intmap)[[2]]
      pbar<-rep(0,nkeep)
      for (i in 1:nkeep){
          index<-initfit$intmap[1,]==intmap[1,i] & initfit$intmap[2,]==intmap[2,i]
          if (any(index)){
              if (length(initfit$pf[index])>1) stop("initfit has non-unique intmap columns")
              pbar[i]<- initfit$pf[index]
          }
      }
      if (sum(pbar)==0) stop("initfit has no matching intmap elements with L and R")
      pbar<-pbar/sum(pbar)

      ## em is the em-algorithm, at any iteration if the estimate of the probability
      ## mass at any point is lower than the lower.bound, then that probability
      ## mass is set to zero. Then the Kuhn-Tucker conditions are checked, if they are
      ## not met then a small mass is added back to those values set to zero. This is the
      ## polishing methoded of
      ## Gentleman and Geyer (1994, Biometrika, 618-623)

      ## start count keeps a running total of the number of iterations, since
      ## em may be called more than once with a different lower.bound
      em<-function(A,pbar,lower.bound=.01,startcount=1){
          converge<-FALSE
          message<-"normal convergence"
          A.pbar<-as.vector(A %*% pbar)
          if (any(A.pbar==0)) stop("initfit$pf does not have nonzero values associated with needed intmap spaces")
          J1<-matrix(1,n,1)
          Jn<-matrix(1/n,n,1)
          for (i in startcount:maxit){
              tA.div.A.pbar <- t(A/A.pbar)
              newpbar<- drop((tA.div.A.pbar * pbar) %*% Jn)
              d<-drop(tA.div.A.pbar %*% J1)
              # below is an older slower version
              #A.div.A.pbar <- A/A.pbar
              #newpbar<- apply(t(A.div.A.pbar)*pbar,1,mean)
              #d <- apply(A.div.A.pbar, 2, sum)
              u <-  - d + n
              u[newpbar > 0] <- 0
              error <- max(d + u - n)
              if (error<epsilon){
                  pbar<-newpbar
                  converge<-TRUE
                  if(any(u < 0)){
                      message<-"Kuhn-Tucker conditions not met, self-consistent estimator not MLE"
                      #pbar[u<0]<-min(pbar[pbar>0])
                      pbar[u<0]<-lower.bound
                      pbar<-pbar/sum(pbar)
                  }
                  break()
              }
              newpbar[newpbar<lower.bound]<-0
              A.pbar<-as.vector(A %*% newpbar)
              if (any(A.pbar==0)){
                  message<-"lower bound too high"
                  break()}
              pbar<-newpbar
              if (i==maxit) message<-"maxit reached"
          }
          out<-list(A=A,pbar=pbar,count=i,message=message,converge=converge,error=error)
          out
      }

      ## try different values to "polish" the estimates, where if
      ## the jump in the distribution (mass at any iterval) is less than the lower.bound then
      ## set it equal to zero. If it turns out that a jump should not
      ## have been set to zero, the em function checks that and spits out
      ## the last pbar before the jumps were set to zero
      ## (see Gentleman and Geyer, 1994)
      lower.bounds<- 10^(0:ceiling(log10(epsilon)))
      lower.bounds<-lower.bounds[lower.bounds<=max(1/n,epsilon)]
      emout<-list(pbar=pbar,count=0)
      for (lb in lower.bounds){
          emout<-em(A,pbar=emout$pbar,lower.bound=lb,startcount=emout$count+1)
          if (emout$message=="normal convergence") break()
          if (emout$count==maxit) {
              emout$message<-"problem with convergence, increase maxit"
              break()
          }
          #print(emout$message)
      }
      keep<- !emout$pbar==0
      anypzero<-FALSE
      if (!all(keep)){
          LRin<-attr(intmap,"LRin")[,keep]
          intmap<-intmap[,keep]
          attr(intmap,"LRin")<-LRin
          A<-A[,keep]
          anypzero<-TRUE
      }
      pf<-emout$pbar[keep]
  } # end else for k>1
  strata<-length(pf)
  ## if the A matrix corresponding to the non-zero pf values is full rank, then the NPMLE is mixture unique
  ## see Gentleman and Geyer, 1994, Biometrika 618- or Gentleman and Vandal, 2002 Can J Stat, 557-
  ## DO NOT NEED THIS for univariate interval censored data because it will always be mixture unique
  ## munique<-qr(A)$rank==dim(A)[2]
  # if there is only one strata, title describes output:  NPMLE
  names(strata)<-"NPMLE"
  out <- list(A=A, strata=strata, error = emout$error, numit = emout$count, pf = pf, intmap =
      intmap, converge= emout$converge, message= emout$message, anypzero=anypzero)
  class(out)<-c("icfit","list")
  return(out)
}


# `icfit.default` <-
# function(L, R, initfit = NULL, control=icfitControl(), Lin=NULL, Rin=NULL, conf.int=FALSE,...)
# {
#     out <- icfitCalc(L, R, initfit, control, Lin, Rin,...)
#     if (conf.int){
#         if (control$confMethod!="modboot") stop("only modified bootstrap confidence method available")
#         out$CI <- icfitBootCI(L,R,conf.level=control$conf.level, B=control$B, timeEpsilon= control$timeEpsilon, seed=control$seed,
#             messages=control$timeMessage,...)
#     }
#     out
# }
#


# SurvLR <-function(x){
#     ## change Surv object to data.frame with L and R columns
#     ## type=right, left or interval are allowed, type=counting is not
#     type<-attr(x,"type")
#     if (type=="right"){
#         L<-R<-x[,1]
#         R[x[,2]==0]<-Inf
#     } else if (type=="counting") {
#         stop("Surv object type='counting' not supported")
#     } else if (type=="left"){
#         L<-R<-x[,1]
#         L[x[,2]==0]<-0
#     } else if (type=="interval"){
#         L <- R <- x[,1]
#         R[x[,3]==0]<-Inf
#         L[x[,3]==2]<-0
#         R[x[,3]==3]<-x[x[,3]==3,2]
#     } else { stop(paste("Surv obj type='",type,"' unrecognized",sep=""))
#     }
#     ## add Lin and Rin to work with right censored data
#     Lin<-rep(FALSE,length(L))
#     Rin<-rep(TRUE,length(L))
#     Lin[L==R]<-TRUE
#     Rin[R==Inf]<-FALSE
#     out<-data.frame(L=L,R=R,Lin=Lin,Rin=Rin)
#     return(out)
# }

# `icfitControl`<-function (epsilon = 1e-06, maxit = 10000, initfitOpts=NULL,
#       conf.level=.95, B=200, confMethod="modboot", seed=19439101, timeEpsilon=1e-06, timeMessage=TRUE)
# {
#     if (!is.numeric(epsilon) || epsilon <= 0)
#         stop("value of 'epsilon' must be > 0")
#     if (!is.numeric(maxit) || maxit <= 0)
#         stop("maximum number of iterations must be > 0")
#     if (!is.numeric(conf.level) & (conf.level>=1 | conf.level<=0))
#         stop("conf.level must be between 0 and 1")
#     if (!is.numeric(B)  || B<=10)
#         stop("B must be at least 11")
#
#     list(epsilon = epsilon, maxit = maxit, initfitOpts= initfitOpts, conf.level=conf.level,
#         B=B, confMethod=confMethod, seed=seed, timeEpsilon=timeEpsilon, timeMessage=timeMessage)
# }
OnofriAndreaPG/drcte documentation built on April 14, 2025, 10:44 a.m.