# tests/r_sas.R

options(na.action=na.exclude) # preserve missings
options(contrasts=c('contr.treatment', 'contr.poly')) # ensure contrast type
library(survival)

#
# Reproduce example 1 in the SAS lifereg documentation
#

# This fit doesn't give the same log-likelihood that SAS claims
fit1 <- survreg(Surv(time, status) ~ I(1000/(273.2+temp)), imotor,
		subset=(temp>150), dist='lognormal')
summary(fit1)

# This one, with the loglik on the transformed scale (the inappropriate
#   scale, Ripley & Venables would argue), does agree.
# All coefficients are of course identical; a quick check follows.
fit2 <- survreg(Surv(log(time), status) ~ I(1000/(273.2+temp)), imotor,
		subset=(temp>150), dist='gaussian')
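
# Quick check of the claim above (an added check, not in the SAS example):
#  survreg fits the lognormal by working with log(time) internally, so the
#  two parameter sets should agree essentially exactly
all.equal(coef(fit1), coef(fit2))
all.equal(fit1$scale, fit2$scale)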


# Give the quantile estimates, which is the lower half of "output 48.1.5"
#  in the SAS 9.2 manual

pp1 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
		      type='quantile', se=T)
pp2 <- predict(fit1, newdata=list(temp=c(130,150)), p=c(.1, .5, .9),
		      type='uquantile', se=T)
pp1
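
# Consistency check (an added check): for a lognormal fit, quantiles on the
#  original scale should be the exponential of the uquantile results
all.equal(pp1$fit, exp(pp2$fit))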

temp130 <- matrix(0, nrow=3, ncol=6)
temp130[,1] <- pp1$fit[1,]
temp130[,2] <- pp1$se.fit[1,]
temp130[,3] <- pp2$fit[1,]
temp130[,4] <- pp2$se.fit[1,]
temp130[,5] <- exp(pp2$fit[1,] - 1.64*pp2$se.fit[1,]) # 1.64 = qnorm(.95): 90% CI
temp130[,6] <- exp(pp2$fit[1,] + 1.64*pp2$se.fit[1,])
dimnames(temp130) <- list(c("p=.1", "p=.5", "p=.9"),
     c("Time", "se(time)", "log(time)", "se[log(time)]", 
       "lower 90", "upper 90"))
print(temp130)

# A set of examples, copied from the manual pages of SAS procedure
#  "reliability", which is part of their QC product.
#

color <- c("black", "red", "green", "blue", "magenta", "red4",
                        "orange", "DarkGreen", "cyan2", "DarkViolet")
palette(color)
pdf(file='reliability.pdf')

#
# Insulating fluids example
#

# Adding a -1 to the fit just causes each group to have its own
# intercept, rather than a global intercept + contrasts.  The strata
# statement allows each group to have a separate scale.
ffit <- survreg(Surv(time) ~ factor(voltage) + strata(voltage) -1, ifluid)
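
# Spot check (an added check, not part of the SAS example): with separate
#  intercepts and scales the likelihood decouples into 4 independent fits,
#  so the first group's parameters should match a fit to that group alone
sfit <- survreg(Surv(time) ~ 1, ifluid, subset=(voltage==min(voltage)))
all.equal(c(unname(coef(sfit)), sfit$scale),
          unname(c(ffit$coef[1], ffit$scale[1])))
rm(sfit)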

# Get predicted quantiles at each of the voltages
# By default predict() would give a line of results for each observation;
#  I only want the unique set of x's, i.e., only 4 cases
uvolt <- sort(unique(ifluid$voltage))      #the unique levels
plist <- c(1, 2, 5, 1:9 *10, 95, 99)/100
pred  <- predict(ffit, type='quantile', p=plist,
                 newdata=data.frame(voltage=uvolt))
tfun <- function(x) log(-log(1-x))  # complementary log-log, linearizes Weibull quantiles

matplot(t(pred), tfun(plist), type='l', log='x', lty=1,
        col=1:4, yaxt='n',
        xlab="Predicted", ylab="Quantile")
axis(2, tfun(plist), format(100*plist), adj=1)

kfit <- survfit(Surv(time) ~ voltage, ifluid, type='fleming') #Fleming-Harrington fit
for (i in 1:4) {
    temp <- kfit[i]
    points(temp$time, tfun(1-temp$surv), col=i, pch=i)
    }

# Now a table
temp <- array(0, dim=c(4,4,4))  #4 groups by 4 parameters by 4 stats
temp[,1,1] <- ffit$coef         # "EV Location" in SAS manual
temp[,2,1] <- ffit$scale        # "EV scale"
temp[,3,1] <- exp(ffit$coef)    # "Weibull Scale"
temp[,4,1] <- 1/ffit$scale      # "Weibull Shape"
 
temp[,1,2] <- sqrt(diag(ffit$var))[1:4]   #standard error
temp[,2,2] <- sqrt(diag(ffit$var))[5:8] * ffit$scale # delta method: se(scale) = se(log scale)*scale
temp[,3,2] <- temp[,1,2] * temp[,3,1]                # se(exp(loc)) = se(loc)*exp(loc)
temp[,4,2] <- temp[,2,2] / (temp[,2,1])^2            # se(1/scale)  = se(scale)/scale^2

temp[,1,3] <- temp[,1,1] - 1.96*temp[,1,2]  #lower conf limits 
temp[,1,4]  <- temp[,1,1] + 1.96*temp[,1,2] # upper
# log(scale) is the natural parameter, in which the routine did its fitting
#  and on which the std errors were computed
temp[,2, 3] <- exp(log(ffit$scale) - 1.96*sqrt(diag(ffit$var))[5:8])
temp[,2, 4] <- exp(log(ffit$scale) + 1.96*sqrt(diag(ffit$var))[5:8])

temp[,3, 3:4] <- exp(temp[,1,3:4])
temp[,4, 3:4] <- 1/temp[,2,4:3]  # the reciprocal reverses the CI endpoints

dimnames(temp) <- list(uvolt, c("EV Location", "EV Scale", "Weibull scale",
                                "Weibull shape"),
                       c("Estimate", "SE", "lower 95% CI", "uppper 95% CI"))
print(aperm(temp, c(2,3,1)), digits=5)

rm(temp, uvolt, plist, pred, ffit, kfit) 

#####################################################################
# Turbine cracks data
# Interval-censored counts: day1=NA is left censored, day2=NA is right
#  censored; 167 units were on test, so 167-sum(fail) never failed
crack2 <- with(cracks, data.frame(day1=c(NA, days), day2=c(days, NA),
                                  n=c(fail, 167-sum(fail))))
cfit <- survreg(Surv(day1, day2, type='interval2') ~1, 
                dist='weibull', data=crack2, weights=n)

summary(cfit)
#Their output also has Weibull scale = exp(cfit$coef), shape = 1/(cfit$scale)
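# For reference, compute them here as well (an added line, not SAS output)
c(Weibull.scale = exp(unname(coef(cfit))), Weibull.shape = 1/cfit$scale)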

# Draw the SAS plot
#  The "type=fleming" argument reflects that they estimate hazards rather than
#  survival, and forces a Nelson-Aalen hazard estimate
#
plist <-  c(1, 2, 5, 1:8 *10)/100
plot(qsurvreg(plist, cfit$coef, cfit$scale), tfun(plist), log='x',
     yaxt='n', type='l',
     xlab="Weibull Plot for Time", ylab="Percent")
axis(2, tfun(plist), format(100*plist), adj=1)

kfit <- survfit(Surv(day1, day2, type='interval2') ~1, data=crack2,
                weight=n, type='fleming')
# Only plot points where n.event > 0
# Why?  I'm trying to match the SAS output.  Personally, I think all should
#  be plotted.
who <- (kfit$n.event > 0)
points(kfit$time[who], tfun(1-kfit$surv[who]), pch='+')
points(kfit$time[who], tfun(1-kfit$upper[who]), pch='-')
points(kfit$time[who], tfun(1-kfit$lower[who]), pch='-')

text(rep(3,6), seq(.5, -1.0, length=6), 
         c("Scale", "Shape", "Right Censored", "Left Censored", 
           "Interval Censored", "Fit"), adj=0)
text(rep(9,6), seq(.5, -1.0, length=6), 
         c(format(round(exp(cfit$coef), 2)),
           format(round(1/cfit$scale, 2)),
           format(tapply(crack2$n, cfit$y[,3], sum)), "ML"), adj=1)

# Now a portion of the SAS percentiles table
#  I don't get the same SE as SAS; I haven't checked out why.  The
#  estimates and SE for the underlying Weibull model are the same.
temp <- predict(cfit, type='quantile', p=plist, se=T)
tempse <- temp$se.fit[1,]
mat <- cbind(temp$fit[1,], tempse, 
             temp$fit[1,] -1.96*tempse, temp$fit[1,] + 1.96*tempse)
dimnames(mat) <- list(plist*100, c("Estimate", "SE", "Lower .95", "Upper .95"))
print(mat)

#
# The cracks data has a particularly easy estimate: the intervals do not
#  overlap, so the NPMLE is the empirical CDF with mass at the interval
#  midpoints.  Use it to double-check the code.
time <- c(crack2$day2[1], (crack2$day1 + crack2$day2)[2:8]/2, 
          crack2$day1[9])
cdf  <- cumsum(crack2$n)/sum(crack2$n)
all.equal(kfit$time, time)
all.equal(kfit$surv, 1-cdf[c(1:8,8)]) 
rm(time, cdf, kfit)


#######################################################
#
# Replacement of valve seats in diesel engines
#   The input data has id, time, and an indicator of whether there was an
#   event at that time: 0=no, 1=yes.  No one has an event at their last time.
#  Two engines have dual failures: number 328 loses 2 valves at
#    time 653, and number 402 loses 2 at time 139.  For each, fudge the first
#    time to be .1 days earlier. 
#
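# Verify the claim above (an added check): no event at any engine's last time
with(valveSeat, all(status[!duplicated(id, fromLast=TRUE)] == 0))
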
ties <- which(diff(valveSeat$time)==0 & diff(valveSeat$id)==0)
temp <- valveSeat$time
temp[ties] <- temp[ties] - .1
n <- length(temp)
first <- !duplicated(valveSeat$id)
# time1 = start of each (start, stop] interval: 0 for an engine's first
#  row, otherwise the prior stop time
vtemp <- with(valveSeat, data.frame(id =id, 
                                    time1= ifelse(first, 0, c(0, temp[-n])),
                                    time2= temp, status=status))

kfit <- survfit(Surv(time1, time2, status) ~1, vtemp, id=id) # id= ties together the multiple rows per engine

plot(kfit, fun='cumhaz', ylab="Sample Mean Cumulative Failures", xlab='Time')
title("Valve replacement data")

# The summary.survfit function doesn't have an option for printing out
#   cumulative hazards instead of survival (that should be added someday),
#   so I reprise the central code of print.summary.survfit here
xx <- summary(kfit)
temp <- cbind(xx$time, xx$n.risk, xx$n.event, xx$cumhaz, 
              xx$std.chaz, -log(xx$upper), -log(xx$lower)) # -log swaps upper and lower
dimnames(temp) <- list(rep("", nrow(temp)),
                       c("time", "n.risk", "n.event", "Cum haz", "std.err",
                         "lower 95%", "upper 95%"))
print(temp, digits=2)

# Note that I have the same estimates but different SEs than SAS.  We are
#  using different estimators.  It's a statistical argument as to which is
#  better (one could defend both sides): SAS uses the more standard estimate
#  found in the reliability literature, mine the estimate from the
#  statistics literature.
rm(temp, kfit, xx)
                    
######################################################
# Turbine data, lognormal fit
turbine <- read.table('data.turbine', 
                      col.names=c("time1", "time2", "n"))

tfit <- survreg(Surv(time1, time2, type='interval2') ~1, turbine,
                dist='lognormal', weights=n, subset=(n>0))

summary(tfit)

# Now reproduce the SAS plot, but put bootstrap confidence bands on it!
#  First, make a simple data set without weights
tdata <- turbine[rep(1:nrow(turbine), turbine$n),]

qstat <- function(data) {
    temp <- survreg(Surv(time1, time2, type='interval2') ~1, data=data,
                    dist='lognormal')
    qsurvreg(plist, temp$coef, temp$scale, dist='lognormal')
    }
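
# Sanity check (an added check): qstat on the expanded data should reproduce
#  the quantiles from the weighted fit above, up to convergence tolerance
all.equal(qstat(tdata), qsurvreg(plist, tfit$coef, tfit$scale,
                                 dist='lognormal'), tolerance=1e-6)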

# Use bootstrap() with BCa limits when available (e.g., in S-Plus);
#  otherwise fall back to a simple percentile bootstrap
{if (exists('bootstrap')) {
    set.seed(1953)  # a good year :-)
    bfit <- bootstrap(tdata, qstat, B=1000)
    bci <- limits.bca(bfit, probs=c(.025, .975))
    }
else {
    values <- matrix(0, nrow=1000, ncol=length(plist))
    n <- nrow(tdata)
    for (i in 1:1000) {
        subset <- sample(1:n, n, replace=T)
        values[i,] <- qstat(tdata[subset,])
        }
    bci <- t(apply(values, 2, quantile, c(.025, .975)))  # percentile 95% interval
    }
 }
xmat <- cbind(qsurvreg(plist, tfit$coef, tfit$scale, dist='lognormal'),
              bci)


matplot(xmat, qnorm(plist), 
        type='l', lty=c(1,2,2), col=c(1,1,1), 
        log='x', yaxt='n', ylab='Percent', 
        xlab='Time of Cracking (Hours x 100)')
axis(2, qnorm(plist), format(100*plist), adj=1)
title("Turbine Data")
kfit <- survfit(Surv(time1, time2, type='interval2') ~1, data=tdata)
points(kfit$time, qnorm(1-kfit$surv), pch='+')

dev.off()  #close the plot file
