tests/turnbull.R

options(na.action=na.exclude) # preserve missings
options(contrasts=c('contr.treatment', 'contr.poly')) #ensure constrast type
library(survival)

#
# The test data set from Turnbull, JASA 1974, 169-73.
#
#  status  0=right censored
#          1=exact
#          2=left censored
#
aeq <- function(x,y, ...) all.equal(as.vector(x), as.vector(y), ...)


turnbull <- data.frame( time  =c( 1,1,1, 2,2,2, 3,3,3, 4,4,4),
			status=c( 1,0,2, 1,0,2, 1,0,2, 1,0,2),
			  n   =c(12,3,2, 6,2,4, 2,0,2, 3,3,5))
#
# Compute the K-M for the Turnbull data
#      via a slow EM calculation
#

emsurv <- function(time, status, wt, verbose=T) {
    left.cen <- (status==2)
    if (!any(left.cen)) stop("No left censored data!")
    if (!any(status==1))stop("Must have some exact death times")

    tempy <- Surv(time[!left.cen], status[!left.cen])
    ww <- wt[!left.cen]
    tempx <- factor(rep(1, sum(!left.cen)))
    tfit <- survfit(tempy~tempx, weight=ww)
    if (verbose)
       cat("Iteration 0, survival=", format(round(tfit$surv[tfit$n.event>0],3)),
		       "\n")

    stimes <- tfit$time[tfit$n.event>0]
    ltime <- time[left.cen]
    lwt   <- wt[left.cen]
    tempx <- factor(rep(1, length(stimes) + sum(!left.cen)))
    tempy <- Surv(c(time[!left.cen], stimes),
		  c(status[!left.cen], rep(1, length(stimes))))
    for (iter in 1:4) {
	wt2 <- stimes*0
	ssurv <- tfit$surv[tfit$n.event>0]
	sjump <- diff(c(1, ssurv))
	for (j in 1:(length(ltime))) {
	    k <- sum(ltime[j]>=stimes)   #index of the death time
	    if (k==0)
		stop("Left censored observation before the first death")
	    wt2[1:k] <- wt2[1:k] + lwt[j]*sjump[1:k] /(ssurv[k]-1)
	    }
	tfit <- survfit(tempy~tempx, weight=c(ww, wt2))
	if (verbose) {
	   cat("Iteration", iter, "survival=",
		 format(round(tfit$surv[tfit$n.event>0],3)),  "\n")
	   cat("             weights=", format(round(wt2,3)), "\n")
	   }
	}
    survfit(tempy ~ tempx, weights=c(ww, wt2), robust=FALSE)
    }

temp <-emsurv(turnbull$time, turnbull$status, turnbull$n)
print(summary(temp))
# First check, use the data from Turnbull, JASA 1974, 169-173.

tdata <- data.frame(time  =c(1,1,1,2,2,2,3,3,3,4,4,4),
		    status=rep(c(1,0,2),4),
		    n     =c(12,3,2,6,2,4,2,0,2,3,3,5))

tfit <- survfit(Surv(time, time, status, type='interval') ~1, tdata, weight=n)
all.equal(round(tfit$surv,3), c(.538, .295, .210, .095))


# Second check, compare to a reversed survival curve
# This is not as simple a test as one might think, because left and right
#  censored observations are not treated symmetrically by the routine:
#  time <= y for left and time> y for right (this is to make the routine
#  correct for the common situation of panel data).
# To get equivalence, make the left censoreds happen just a little bit
#  earlier.  The left-continuous/right-continuous shift is also a bother.
#
test1 <- data.frame(time=  c(9, 3,1,1,6,6,8),
                    status=c(1,NA,1,0,1,1,0),
                    x=     c(0, 2,1,1,1,0,0))
fit1 <- survfit(Surv(time, status) ~1, test1)
temp <- ifelse(test1$status==0, 4.99,5) - test1$time
fit2 <- survfit(Surv(temp, status, type='left') ~1, test1)

all.equal(round(fit1$surv[1:2],5), round(1-fit2$surv[3:2],5))

rm(tdata, tfit, fit1, temp, fit2)
#
# Create a data set similar to the one provided by Al Zinsmeister
#  It is a hard test case for survfit.turnbull
#
time1 <- c(rep(0,100), rep(1,200), 100, 200, 210, 220,
           rep(365,100), rep(366,5), 731:741)

time2 <- c((1:100)*3,  10+1:100, rep(365:366, c(60,40)), NA, 500, NA, 450,
           rep(730,90), rep(NA,10), c(528,571,691,730,731),
           NA, 1095:1099, NA, 1400, 1200, 772, 1461)

zfit <- survfit(Surv(time1, time2, type='interval2') ~1)

#
# There are 100 intervals of the form (0,x) where x is from 3 to 300,
#  and 200 more of the form (1,x) where x is from 11 to 366.  These
#  lead to a mass point in the interval (1,3), which is placed at 2.
#  The starting estimate has far too little mass placed here, and it takes
#  the EM a long time to realize that most of the weight for the first 300
#  subjects goes here.  With acceleration, it takes 16 iterations, without
#  it takes >40.  (On Al's orginal data, without accel still wasn't there after
#  165 iters!)
#
# The next 4 obs give rise to potential jumps at 100.5, 200.5, 211.5, and
#  221.  However, the final estimate has no mass at all on any of these.
#  Assume mass of a,b, and c at 2, 100.5 and 365.5, and consider the 
#  contributions: 
#    123 obs that overlap a only
#    137 obs that overlap a and b
#     40 obs that overlap a, b, c
#      1 obs that overlap b, c
#    108 obs that overlap c   (200, 210,200, 365, and 366 starting points)
#  For some trial values of a,b,c, compare the loglik to that of (a+b),0,c
#   First one: a^123 (a+b)^137 (a+b+c)^40 (b+c) c^108
#   Second:    (a+b)^123 (a+b)^137 (a+b+c)^40 c c^108
#   Likelhood improves if (1 + b/a)^123 > 1+ b/c, which is true for almost
#     all a and c.  In particular, at the solution a and c are approx .7 and
#     .18, respectively.
#
# The program can't see this coming, of course, and so iterates towards a
#  KM with epsilon sized jumps at 100.5, 200.5, and 211.5.  Whether these
#  intervals should be removed during iteration, as detected, is an open
#  question for me.
#
#
# True solution: mass points at 2, 365.5, 408, and 756.5, of sizes a, b, c, d
# Likelihood:    a^260 (a+b)^40 (b+c)^92 (b+c+d)^12 c^5 d^11
# Solution: a=0.6958, b=0.1674, c=0.1079, d=0.0289

tfun <- function(x) {
    if (length(x) ==3) x <- c(x, .03)
    x <- x/sum(x)  #make probabilities sum to 1
    loglik <- 260*log(x[1]) + 40*log(x[1]+x[2]) + 92*log(x[2] + x[3]) +
                       12*log(x[2]+x[3]+x[4]) + 5*log(x[3]) + 11*log(x[4])
    -loglik  #find the max, not the min
    }

nfit <- nlminb(start=c(.7,.15, .1), tfun, lower=0, upper=1)
nparm <- c(nfit$par, .03)
nparm <- nparm / sum(nparm)
zparm <- -diff(c(1, zfit$surv[match(c(2, 365.5, 408, 756.5), zfit$time)]))
aeq(round(tfun(nparm),4), round(tfun(zparm),4))
# .0001 is the tolerance in survfit.turnbull

rm(tfun, nfit, nparm, zparm, time1, time2, zfit) 

Try the survival package in your browser

Any scripts or data that you put into this service are public.

survival documentation built on Aug. 24, 2021, 5:06 p.m.