# tests/r_sas.R

options(na.action=na.exclude) # preserve missings
options(contrasts=c('contr.treatment', 'contr.poly')) # ensure contrast type
library(survival)

#
# Reproduce example 1 in the SAS lifereg documentation
#

# This fit doesn't give the same log-likelihood that SAS claims
fit1 <- survreg(Surv(time, status) ~ I(1000/(273.2+temp)), imotor,
		subset=(temp>150), dist='lognormal')
summary(fit1)

# This one, with the loglik on the transformed scale (the inappropriate
#   scale, Ripley & Venables would argue), does agree.
# All coefficients are of course identical; a quick check follows.
fit2 <- survreg(Surv(log(time), status) ~ I(1000/(273.2+temp)), imotor,
		subset=(temp>150), dist='gaussian')
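
# Quick check of the claim above (an added check, not in the SAS example):
#  survreg fits the lognormal by working with log(time) internally, so the
#  two parameter sets should agree essentially exactly
all.equal(coef(fit1), coef(fit2))
all.equal(fit1$scale, fit2$scale)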


# Give the quantile estimates, which is the lower half of "output 48.1.5"
#  in the SAS 9.2 manual

pp1 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
		      type='quantile', se=T)
pp2 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
		      type='uquantile', se=T)
pp1
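
# Consistency check (an added check): for a lognormal fit, quantiles on the
#  original scale should be the exponential of the uquantile results
all.equal(pp1$fit, exp(pp2$fit))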

temp130 <- matrix(0, nrow=3, ncol=6)
temp130[,1] <- pp1$fit[1,]
temp130[,2] <- pp1$se.fit[1,]
temp130[,3] <- pp2$fit[1,]
temp130[,4] <- pp2$se.fit[1,]
temp130[,5] <- exp(pp2$fit[1,] - 1.64*pp2$se.fit[1,]) # 1.64 = qnorm(.95): 90% CI
temp130[,6] <- exp(pp2$fit[1,] + 1.64*pp2$se.fit[1,])
dimnames(temp130) <- list(c("p=.1", "p=.5", "p=.9"),
     c("Time", "se(time)", "log(time)", "se[log(time)]", 
       "lower 90", "upper 90"))
print(temp130)

# A set of examples, copied from the manual pages of SAS procedure
#  "reliability", which is part of their QC product.
#

color <- c("black", "red", "green", "blue", "magenta", "red4",
                        "orange", "DarkGreen", "cyan2", "DarkViolet")
palette(color)
pdf(file='reliability.pdf')

#
# Insulating fluids example
#

# Adding a -1 to the fit just causes each group to have its own
# intercept, rather than a global intercept + contrasts.  The strata
# statement allows each group to have a separate scale.
ffit <- survreg(Surv(time) ~ factor(voltage) + strata(voltage) -1, ifluid)
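
# Spot check (an added check, not part of the SAS example): with separate
#  intercepts and scales the likelihood decouples into 4 independent fits,
#  so the first group's parameters should match a fit to that group alone
sfit <- survreg(Surv(time) ~ 1, ifluid, subset=(voltage==min(voltage)))
all.equal(c(unname(coef(sfit)), sfit$scale),
          unname(c(ffit$coef[1], ffit$scale[1])))
rm(sfit)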

# Get predicted quantiles at each of the voltages
# By default predict() would give a line of results for each observation;
#  I only want the unique set of x's, i.e., only 4 cases
uvolt <- sort(unique(ifluid$voltage))      #the unique levels
plist <- c(1, 2, 5, 1:9 *10, 95, 99)/100
pred  <- predict(ffit, type='quantile', p=plist,
                 newdata=data.frame(voltage=uvolt))
tfun <- function(x) log(-log(1-x))  # complementary log-log, linearizes Weibull quantiles

matplot(t(pred), tfun(plist), type='l', log='x', lty=1,
        col=1:4, yaxt='n',
        xlab="Predicted", ylab="Quantile")
axis(2, tfun(plist), format(100*plist), adj=1)

kfit <- survfit(Surv(time) ~ voltage, ifluid, type='fleming') #Fleming-Harrington fit
for (i in 1:4) {
    temp <- kfit[i]
    points(temp$time, tfun(1-temp$surv), col=i, pch=i)
    }

# Now a table
temp <- array(0, dim=c(4,4,4))  #4 groups by 4 parameters by 4 stats
temp[,1,1] <- ffit$coef         # "EV Location" in SAS manual
temp[,2,1] <- ffit$scale        # "EV scale"
temp[,3,1] <- exp(ffit$coef)    # "Weibull Scale"
temp[,4,1] <- 1/ffit$scale      # "Weibull Shape"
 
temp[,1,2] <- sqrt(diag(ffit$var))[1:4]   #standard error
temp[,2,2] <- sqrt(diag(ffit$var))[5:8] * ffit$scale # delta method: se(scale) = se(log scale)*scale
temp[,3,2] <- temp[,1,2] * temp[,3,1]                # se(exp(loc)) = se(loc)*exp(loc)
temp[,4,2] <- temp[,2,2] / (temp[,2,1])^2            # se(1/scale)  = se(scale)/scale^2

temp[,1,3] <- temp[,1,1] - 1.96*temp[,1,2]  #lower conf limits 
temp[,1,4]  <- temp[,1,1] + 1.96*temp[,1,2] # upper
# log(scale) is the natural parameter, in which the routine did its fitting
#  and on which the std errors were computed
temp[,2, 3] <- exp(log(ffit$scale) - 1.96*sqrt(diag(ffit$var))[5:8])
temp[,2, 4] <- exp(log(ffit$scale) + 1.96*sqrt(diag(ffit$var))[5:8])

temp[,3, 3:4] <- exp(temp[,1,3:4])
temp[,4, 3:4] <- 1/temp[,2,4:3]  # the reciprocal reverses the CI endpoints

dimnames(temp) <- list(uvolt, c("EV Location", "EV Scale", "Weibull scale",
                                "Weibull shape"),
                       c("Estimate", "SE", "lower 95% CI", "uppper 95% CI"))
print(aperm(temp, c(2,3,1)), digits=5)

rm(temp, uvolt, plist, pred, ffit, kfit) 

#####################################################################
# Turbine cracks data
# Interval-censored counts: day1=NA is left censored, day2=NA is right
#  censored; 167 units were on test, so 167-sum(fail) never failed
crack2 <- with(cracks, data.frame(day1=c(NA, days), day2=c(days, NA),
                                  n=c(fail, 167-sum(fail))))
cfit <- survreg(Surv(day1, day2, type='interval2') ~1, 
                dist='weibull', data=crack2, weights=n)

summary(cfit)
#Their output also has Weibull scale = exp(cfit$coef), shape = 1/(cfit$scale)
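# For reference, compute them here as well (an added line, not SAS output)
c(Weibull.scale = exp(unname(coef(cfit))), Weibull.shape = 1/cfit$scale)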

# Draw the SAS plot
#  The "type=fleming" argument reflects that they estimate hazards rather than
#  survival, and forces a Nelson-Aalen hazard estimate
#
plist <-  c(1, 2, 5, 1:8 *10)/100
plot(qsurvreg(plist, cfit$coef, cfit$scale), tfun(plist), log='x',
     yaxt='n', type='l',
     xlab="Weibull Plot for Time", ylab="Percent")
axis(2, tfun(plist), format(100*plist), adj=1)

kfit <- survfit(Surv(day1, day2, type='interval2') ~1, data=crack2,
                weight=n, type='fleming')
# Only plot points where n.event > 0
# Why?  I'm trying to match the SAS output.  Personally, I think all should
#  be plotted.
who <- (kfit$n.event > 0)
points(kfit$time[who], tfun(1-kfit$surv[who]), pch='+')
points(kfit$time[who], tfun(1-kfit$upper[who]), pch='-')
points(kfit$time[who], tfun(1-kfit$lower[who]), pch='-')

text(rep(3,6), seq(.5, -1.0, length=6), 
         c("Scale", "Shape", "Right Censored", "Left Censored", 
           "Interval Censored", "Fit"), adj=0)
text(rep(9,6), seq(.5, -1.0, length=6), 
         c(format(round(exp(cfit$coef), 2)),
           format(round(1/cfit$scale, 2)),
           format(tapply(crack2$n, cfit$y[,3], sum)), "ML"), adj=1)

# Now a portion of the SAS percentiles table
#  I don't get the same SE as SAS; I haven't checked out why.  The
#  estimates and SE for the underlying Weibull model are the same.
temp <- predict(cfit, type='quantile', p=plist, se=T)
tempse <- temp$se.fit[1,]
mat <- cbind(temp$fit[1,], tempse, 
             temp$fit[1,] -1.96*tempse, temp$fit[1,] + 1.96*tempse)
dimnames(mat) <- list(plist*100, c("Estimate", "SE", "Lower .95", "Upper .95"))
print(mat)

#
# The cracks data has a particularly easy estimate: the intervals do not
#  overlap, so the NPMLE is the empirical CDF with mass at the interval
#  midpoints.  Use it to double-check the code.
time <- c(crack2$day2[1], (crack2$day1 + crack2$day2)[2:8]/2, 
          crack2$day1[9])
cdf  <- cumsum(crack2$n)/sum(crack2$n)
all.equal(kfit$time, time)
all.equal(kfit$surv, 1-cdf[c(1:8,8)]) 
rm(time, cdf, kfit)


#######################################################
#
# Replacement of valve seats in diesel engines
#   The input data has id, time, and an indicator of whether there was an
#   event at that time: 0=no, 1=yes.  No one has an event at their last time.
#  Two engines have dual failures: number 328 loses 2 valves at
#    time 653, and number 402 loses 2 at time 139.  For each, fudge the first
#    time to be .1 days earlier. 
#
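# Verify the claim above (an added check): no event at any engine's last time
with(valveSeat, all(status[!duplicated(id, fromLast=TRUE)] == 0))
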
ties <- which(diff(valveSeat$time)==0 & diff(valveSeat$id)==0)
temp <- valveSeat$time
temp[ties] <- temp[ties] - .1
n <- length(temp)
first <- !duplicated(valveSeat$id)
# time1 = start of each (start, stop] interval: 0 for an engine's first
#  row, otherwise the prior stop time
vtemp <- with(valveSeat, data.frame(id =id, 
                                    time1= ifelse(first, 0, c(0, temp[-n])),
                                    time2= temp, status=status))

kfit <- survfit(Surv(time1, time2, status) ~1, vtemp, id=id) # id= ties together the multiple rows per engine

plot(kfit, fun='cumhaz', ylab="Sample Mean Cumulative Failures", xlab='Time')
title("Valve replacement data")

# The summary.survfit function doesn't have an option for printing out
#   cumulative hazards instead of survival (that should be added someday),
#   so I reprise the central code of print.summary.survfit here
xx <- summary(kfit)
temp <- cbind(xx$time, xx$n.risk, xx$n.event, xx$cumhaz, 
              xx$std.chaz, -log(xx$upper), -log(xx$lower)) # -log swaps upper and lower
dimnames(temp) <- list(rep("", nrow(temp)),
                       c("time", "n.risk", "n.event", "Cum haz", "std.err",
                         "lower 95%", "upper 95%"))
print(temp, digits=2)

# Note that I have the same estimates but different SEs than SAS.  We are
#  using different estimators.  It's a statistical argument as to which is
#  better (one could defend both sides): SAS uses the more standard estimate
#  found in the reliability literature, mine the estimate from the
#  statistics literature.
rm(temp, kfit, xx)
                    
######################################################
# Turbine data, lognormal fit
turbine <- read.table('data.turbine', 
                      col.names=c("time1", "time2", "n"))

tfit <- survreg(Surv(time1, time2, type='interval2') ~1, turbine,
                dist='lognormal', weights=n, subset=(n>0))

summary(tfit)

# Now reproduce the SAS plot, but put bootstrap confidence bands on it!
#  First, make a simple data set without weights
tdata <- turbine[rep(1:nrow(turbine), turbine$n),]

qstat <- function(data) {
    temp <- survreg(Surv(time1, time2, type='interval2') ~1, data=data,
                    dist='lognormal')
    qsurvreg(plist, temp$coef, temp$scale, dist='lognormal')
    }
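
# Sanity check (an added check): qstat on the expanded data should reproduce
#  the quantiles from the weighted fit above, up to convergence tolerance
all.equal(qstat(tdata), qsurvreg(plist, tfit$coef, tfit$scale,
                                 dist='lognormal'), tolerance=1e-6)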

# Use bootstrap() with BCa limits when available (e.g., in S-Plus);
#  otherwise fall back to a simple percentile bootstrap
{if (exists('bootstrap')) {
    set.seed(1953)  # a good year :-)
    bfit <- bootstrap(tdata, qstat, B=1000)
    bci <- limits.bca(bfit, probs=c(.025, .975))
    }
else {
    values <- matrix(0, nrow=1000, ncol=length(plist))
    n <- nrow(tdata)
    for (i in 1:1000) {
        subset <- sample(1:n, n, replace=T)
        values[i,] <- qstat(tdata[subset,])
        }
    bci <- t(apply(values, 2, quantile, c(.025, .975)))  # percentile 95% interval
    }
 }
xmat <- cbind(qsurvreg(plist, tfit$coef, tfit$scale, dist='lognormal'),
              bci)


matplot(xmat, qnorm(plist), 
        type='l', lty=c(1,2,2), col=c(1,1,1), 
        log='x', yaxt='n', ylab='Percent', 
        xlab='Time of Cracking (Hours x 100)')
axis(2, qnorm(plist), format(100*plist), adj=1)
title("Turbine Data")
kfit <- survfit(Surv(time1, time2, type='interval2') ~1, data=tdata)
points(kfit$time, qnorm(1-kfit$surv), pch='+')

dev.off()  #close the plot file
