JSS Example Code"

library(knitr)
opts_chunk$set(out.extra='style="display:block; margin: auto"', fig.align="center")

This is the example code for Dunkler D, Sauerbrei W, Heinze G (2016). Global, Parameterwise and Joint Shrinkage Factor Estimation. Journal of Statistical Software. 69(8), 1-19. http://dx.doi.org/10.18637/jss.v069.i08.

Example R Code

################################################################################
### R code for
### "Global, Parameterwise and Joint Shrinkage Factor Estimation"
### written by Daniela Dunkler, March 2016
################################################################################

## works with R 3.2.3 & shrink-package 1.2.1

## load the packages
library("shrink")
library("survival")
library("mfp")
library("aod")                                                  # for wald-test

sessionInfo()

Section 2.1: Deep Vein Thrombosis Study

## Section 2.1: Deep Vein Thrombosis Study
## load data
data("deepvein")

# number of observations, events, median observation time, and names of
# variables
nrow(deepvein)

deepvein$status2 <- abs(deepvein$status-1)
survfit(Surv(time, status2) ~ 1, data = deepvein)

table(deepvein$status)
round(100*table(deepvein$status)/nrow(deepvein), 2)

sort(names(deepvein))

Section 2.2: Breast Cancer Study

## Section 2.2: Breast Cancer Study
## load data
data("GBSG")


# number of observations, events, median observation time, and names of
# variables
nrow(GBSG)

table(GBSG$cens)
round(100*table(GBSG$cens)/nrow(GBSG), 2)

GBSG$cens2 <- abs(GBSG$cens-1)
# median observation time is given in days here
survfit(Surv(rfst, cens2) ~ 1, data = GBSG)
# median observation time in months
1645 / 30.5


sort(names(GBSG))

Section 5.1: Deep Vein Thrombosis Study

## Section 5.1: Deep Vein Thrombosis Study
# set the reference level for all categorical variables
deepvein$sex <- relevel(deepvein$sex, ref = "female")
deepvein$fiimut <- relevel(deepvein$fiimut, ref = "absent")
deepvein$fvleid <- relevel(deepvein$fvleid, ref = "absent")
deepvein$loc <- relevel(deepvein$loc, ref = "PE")
# contrasts(deepvein$loc)


## fit full model, and compute shrinkage factors (jackknife estimates and dfbeta
## approximations)
fitfull <- coxph(Surv(time, status) ~ log2ddim + bmi + durther + age +
                 sex + loc + fiimut + fvleid, data = deepvein, x = TRUE)
summary(fitfull)


## wald-test for categorical predictor "loc"
wald.test(Sigma = vcov(fitfull)[c("locdistal", "locproximal"),
          c("locdistal", "locproximal")],
          b = fitfull$coeff[c("locdistal", "locproximal")], Terms = 1:2)


shrink(fitfull, type = "global", method = "jackknife")



## apply backward elimination to full model*, and compute shrinkage factors to
## selected model
## * dummy variables of loc are jointly tested and selected
fitd <- step(fitfull, direction = "backward")
print(fitd)


## wald-test for categorical predictor "loc"
wald.test(Sigma = vcov(fitd)[c("locdistal", "locproximal"),
          c("locdistal", "locproximal")],
          b = fitd$coeff[c("locdistal", "locproximal")], Terms = 1:2)



shrink(fitd, type = "global", method = "jackknife")
pwsf <- shrink(fitd, type = "parameterwise", method = "jackknife")
pwsf

sqrt(diag(pwsf$ShrinkageFactorsVCOV))

shrink(fitd, type = "parameterwise", method = "jackknife",
       join = list(c("locdistal", "locproximal")))


## dummy variables of loc are separately tested and selected
## generate dummy variables, fit full model and then apply backward elimination
deepvein <- cbind(deepvein, model.matrix( ~ loc, data = deepvein)[, -1])
fitfull2 <- coxph(Surv(time, status) ~ log2ddim + bmi + durther + age +
                  sex + locdistal + locproximal + fiimut + fvleid,
                  data = deepvein, x = TRUE)  # fitfull2 is identical to fitfull

fitd2 <- step(fitfull2, direction = "backward")
summary(fitd2)

shrink(fitd2, type = "global", method = "jackknife")
shrink(fitd2, type = "parameterwise", method = "jackknife")



## Table 2
t2_jack <- c(shrink(fitd, type = "global", method = "jackknife")$ShrinkageFactors,
             pwsf$ShrinkageFactors,
             shrink(fitd, type = "parameterwise", method = "jackknife",
                    join = list(c("locdistal", "locproximal")))$ShrinkageFactors,
             system.time(shrink(fitd, type = "parameterwise", method = "jackknife"))[1])

t2_dfbeta <- c(shrink(fitd, type = "global", method = "dfbeta")$ShrinkageFactors,
               shrink(fitd, type = "parameterwise", method = "dfbeta")$ShrinkageFactors,
               shrink(fitd, type = "parameterwise", method = "dfbeta",
                      join = list(c("locdistal", "locproximal")))$ShrinkageFactors,
               system.time(shrink(fitd, type = "parameterwise", method = "dfbeta"))[1])

t2 <- cbind(Jackknife = t2_jack, DFBETA = t2_dfbeta)

Table2 <- cbind(t2, round((t2[, 2] - t2[, 1]) / t2[, 1] * 100, 1))
Table2[, 1:2] <- round(Table2[, 1:2], 4)
dimnames(Table2)[[2]][3] <- "Relative difference"
Table2 <- rbind(Table2[1,], rep(NA, 3), Table2[2:5,], rep(NA, 3), Table2[6:8,],
                rep(NA, 3), Table2[10,])
dimnames(Table2)[[1]][c(1:2, 7, 10, 12)] <- c("Global shrinkage",
          "Parameterwise shrinkage", "Joint shrinkage", "loc", "Computing time")
Table2

# max. difference
max(abs(Table2[1:10, 1] - Table2[1:10, 2]), na.rm=TRUE)



## Figure 1: Data simulated from deep vein thrombosis study: Comparison of Jackknife and
##           DFBETA-approximated global shrinkage factors for selected and unselected
##           models.

deepvein0 <- subset(deepvein, status == 0)
deepvein1 <- subset(deepvein, status == 1)

n0 <- nrow(deepvein0)
n1 <- nrow(deepvein1)

ratio <- 0.2                                     # based on n1 / n0
## Note: Running this code is time-consuming, since 2 * B * S data sets are analyzed.
## size <- seq(from = 200, to = 2000, length.out = 19)      # based on nrow(deepvein)
# B <- 100
## You may want to reduce B and length.out of size: e.g.
## size <- seq(from = 200, to = 2000, length.out = 4)
## B <- 3
#
# S <- length(size)
# 
# shrinkGJ <- shrinkGD <- shrinkGJsel <- shrinkGDsel <- matrix(NA, nrow = B, ncol = S)
# set.seed(954681)
# 
# for (s in 1:S) {
#   cat("\n", s)
#   for (b in 1:B) {
#     cat(".")
#     data <- rbind(deepvein0[sample(x = 1:n0, size = size[s]*(1-ratio), replace = TRUE),],
#                   deepvein1[sample(x = 1:n1, size = size[s]*ratio, replace = TRUE),])
#     
#     fitfull <- coxph(Surv(time, status) ~ log2ddim + bmi + durther + age + sex + loc +
#                        fiimut + fvleid, data = data, x = TRUE)
#     fitsel <- step(fitfull, direction = "backward", trace = 0)
#     
#     if (!is.null(fitsel$coefficients)) {                     # no null model selected
#       shrinkGJsel[b, s] <- shrink(fitsel, type = "global", method = "jackknife")$ShrinkageFactors
#       shrinkGDsel[b, s] <- shrink(fitsel, type = "global", method = "dfbeta")$ShrinkageFactors
#     }
#     
#     
#     fit <- coxph(Surv(time, status) ~ log2ddim + sex + loc, data = data, x = TRUE)
#     
#     shrinkGJ[b, s] <- shrink(fit, type = "global", method = "jackknife")$ShrinkageFactors
#     shrinkGD[b, s] <- shrink(fit, type = "global", method = "dfbeta")$ShrinkageFactors
#   }
# }
## In some smaller data sets (and especially when shrinkage factors are estimated with the
## Jackknife method) there might be issues with convergence in individual data sets.
## However, coxph will issue a warning.
# 
# 
# shrinkGDselm <- apply(shrinkGDsel, 2, median)
# shrinkGJselm <- apply(shrinkGJsel, 2, median)
# 
# cbind(n = size, Diff_J_D_sel = shrinkGJselm - shrinkGDselm)
# 
# shrinkGDm <- apply(shrinkGD, 2, median)
# shrinkGJm <- apply(shrinkGJ, 2, median)
# 
# cbind(n = size, Diff_J_D = shrinkGJm - shrinkGDm)
# 
# xrange <- c(0.5, 1)
## xrange <- range(shrinkGDselm, shrinkGJselm, shrinkGDm, shrinkGJm)

## pdf("compJvsD.pdf", width = 6, height = 4)
# par(oma = c(0, 0, 0.5, 0.5), mar = c(3.5, 4, 0, 0))
# plot(range(size), xrange, type = "n", las = 1, xlab = "", ylab = "", xaxt = "n")
# axis(1, at = size)
# mtext(side = 1, text = "Sample size", line = 2.3)
# mtext(side = 2, text = "Global shrinkage factor", line = 2.8)
# 
# points(size, shrinkGDselm, pch = 3, col = "darkolivegreen4", cex = 1.5)
# points(size, shrinkGJselm, pch = 1, col = "darkgoldenrod3", cex = 1.3)
# lines(size, shrinkGDselm, lty = 2, col = "darkolivegreen4", lwd = 2)
# lines(size, shrinkGJselm, lty = 3, col = "darkgoldenrod3", lwd = 2)
# 
# points(size, shrinkGDm, pch = 3, col = "firebrick3", cex = 1.5)
# points(size, shrinkGJm, pch = 1, col = "dodgerblue3", cex = 1.3)
# lines(size, shrinkGDm, lty = 2, col = "firebrick3", lwd = 2)
# lines(size, shrinkGJm, lty = 3, col = "dodgerblue3", lwd = 2)
# 
# legend(x = 1600, y = 0.62, legend = c("Jackknife", "DFBETA"), lwd = 2, title = "unselected",
#        col = c("dodgerblue3", "firebrick3"), inset = 0.05, bty = "n", pch = c(1, 3),
#        text.col = c("dodgerblue3", "firebrick3"), title.col = "black")
# 
# legend(x = 1000, y = 0.62, legend = c("Jackknife", "DFBETA"), lwd = 2, title = "selected",
#        col = c("darkgoldenrod3", "chartreuse4"), inset = 0.05, bty = "n", pch = c(1, 3),
#        text.col = c("darkgoldenrod3", "chartreuse4"), title.col = "black")
## dev.off()

Section 5.2: Breast Cancer Study

## Section 5.2: Breast Cancer Study
## define predictors as suggested by Sauerbrei (1999, Applied Statistics)
GBSG$enodes <- exp(-0.12*GBSG$posnodal)


# create ordinal dummy-coded variable tumgrad for grades 1 to 3
contrasts(GBSG$tumgrad) <- matrix(c(0, 1, 1, 0, 0, 1), ncol = 2, byrow = FALSE,
                                  dimnames = list(1:3, c("tumgrad1", "tumgrad2")))
contrasts(GBSG$tumgrad)
# for nicer variable names use default dimnames
contrasts(GBSG$tumgrad) <- matrix(c(0, 1, 1, 0, 0, 1), ncol = 2, byrow = FALSE)


## # model selection (backward elimination) and estimation
fitg <- mfp(Surv(rfst, cens) ~ fp(age) + fp(prm) + fp(esm) + fp(tumsize) +
              fp(enodes) + tumgrad + menostat + strata(htreat),
            family = cox, data = GBSG, alpha = 0.05, select = 0.05)
print(fitg)


## Table 3
## (p-values unconditional on the selected degree and power for prm, age, and
## enodes = fitg$pvalues)
varorder <- c("age.1", "age.2", "prm.1", "enodes.1", "tumgrad1.1")

t3_beta_j <- c(NA, round(fitg$coefficients, 3)[varorder])
t3_df <- c(fitg$df.initial[7, ], NA, NA, fitg$df.initial[c(2, 1, 3), ])
t3_p <- c(round(fitg$pvalues[7, "p.null"], 3), NA, NA,
          round(fitg$pvalues[c(2, 1, 3), "p.null"], 3))
t3_pwsf0 <- shrink(fitg, type = "parameterwise", method = "jackknife")
t3_pwsf <- round(c(NA, t3_pwsf0$ShrinkageFactors[varorder]), 3)
t3_pwsfse <- round(c(NA, sqrt(diag(t3_pwsf0$ShrinkageFactorsVCOV))[varorder]), 3)
t3_jointsf0 <- shrink(fitg, type = "parameterwise", method = "jackknife",
                      join=list(c("age.1", "age.2")))
t3_jointsf <- round(c(t3_jointsf0$ShrinkageFactors[varorder[1]], NA, NA,
                      t3_jointsf0$ShrinkageFactors[varorder[-c(1:2)]]), 3)
t3_jointsfse <- round(c(sqrt(diag(t3_jointsf0$ShrinkageFactorsVCOV))["join.age.1"],
                        NA, NA,
                        sqrt(diag(t3_jointsf0$ShrinkageFactorsVCOV))[varorder[-c(1:2)]]), 3)

Table3 <- cbind("beta_j" = t3_beta_j, "df" = t3_df, "p value" = t3_p,
                "PWSF" = t3_pwsf, "PWSF SE" = t3_pwsfse,
                "Joint shrinkage" = t3_jointsf,
                "Joint shrinkage SE" = t3_jointsfse)
dimnames(Table3)[[1]][1] <- "age"
Table3



## compute shrinkage factors
globalsf <- shrink(fitg, type = "global", method = "jackknife")
globalsf
sqrt(globalsf$ShrinkageFactorsVCOV)

pwsf <- shrink(fitg, type = "parameterwise", method = "jackknife")
pwsf
round(sqrt(diag(pwsf$ShrinkageFactorsVCOV)), 3)
round(cov2cor(pwsf$ShrinkageFactorsVCOV), 3)

jointsf <- shrink(fitg, type = "parameterwise", method = "jackknife",
                  join=list(c("age.1", "age.2")))
jointsf
round(sqrt(diag(jointsf$ShrinkageFactorsVCOV)), 3)



## Figure 2: selected model: Log hazard relative to 50 years
# refit model fitg explicitly including the two dummy variables of
# tumgrad
GBSG <- cbind(GBSG, model.matrix(~tumgrad, data = GBSG)[, -1])

fitg <- mfp(Surv(rfst, cens) ~ fp(age) + fp(prm) + fp(esm) + fp(tumsize) +
              fp(enodes) + tumgrad1 + tumgrad2 + menostat + strata(htreat),
            family = cox, data = GBSG, alpha = 0.05, select = 0.05)

globalsf <- shrink(fitg, type = "global", method = "jackknife")
pwsf <- shrink(fitg, type = "parameterwise", method = "jackknife")
jointsf <- shrink(fitg, type = "parameterwise", method = "jackknife",
                  join=list(c("age.1", "age.2")))

# define new data for which prediction is requested
# newdatref is the new data set for the reference observation
age <- 30:75
newdat <- data.frame(age = age, enodes = 1, prm = 1, tumgrad1 = 0,
                     htreat = factor(1))
newdatref <- data.frame(age = 50, enodes = 1, prm = 1, tumgrad1 = 0,
                        htreat = factor(1))


xaxis <- seq(from = min(age), to = max(age), by = 5)


# pdf("plotgbsg.pdf", width = 6, height = 6)
par(fig = c(0, 1, 0, 0.3), new = FALSE, oma = c(0, 0, 0.5, 0.5),
    mar = c(3.5, 4, 0, 0))
hist(GBSG$age[(GBSG$age >= min(age)) & (GBSG$age <= max(age))],
     breaks = seq(from = min(age), to = max(age), by = 2.5), main = "",
     xlim = range(age), xaxs = "r", yaxs = "r", xlab = "", ylab = "",
     axes = FALSE, col = "gray88")
axis(side = 1, at = xaxis)
axis(side = 2, at = seq(from = 0, to = 60, by = 20), las = 2)
mtext(side = 1, text = "Age", line = 2.3)
mtext(side = 2, text = "Frequency", line = 2.8)
box()

par(fig = c(0,1,0.3,1), new = TRUE, oma = c(0, 0, 0.5, 0.5),
    mar = c(1, 4, 0, 0))
plot(range(age), c(-0.05, 0.8), xlab = "", ylab = "", type = "n", yaxs = "i",
     xaxt = "n", yaxt = "n")
axis(side = 1, at = xaxis, labels = rep("", times = length(xaxis)))
axis(side = 2, at = seq(from = 0, to = 0.7, by = 0.1), las = 2,
     labels = seq(from = 0, to = 0.7, by = 0.1))
mtext(side = 2, text = "Log hazard relative to 50 years", line = 2.8)
lines(x = age, y = predict(fitg, newdata = newdat, type = "lp") -
        predict(fitg, newdata = newdatref, type = "lp"), lty = 1, col = "gray25",
      lwd = 2)
lines(x = age, y = predict(globalsf, newdata = newdat, type = "lp") -
        predict(globalsf, newdata = newdatref, type = "lp"), lty = 5,
      col = "forestgreen", lwd = 2)
lines(x = age, y = predict(pwsf, newdata = newdat, type="lp") -
        predict(pwsf, newdata = newdatref, type = "lp"), lty = 2,
      col = "dodgerblue3", lwd = 2)
lines(x = age, y = predict(jointsf, newdata = newdat, type = "lp") -
        predict(jointsf, newdata = newdatref, type = "lp"), lty = 4,
      col = "firebrick3", lwd = 2)

legend("topright", inset = 0.02, bty = "n", lty = c(1, 5, 4, 2),
       title = "SHRINKAGE",
       legend = c("No", "Global", "Joint", "Parameterwise"),
       col = c("gray25", "forestgreen", "firebrick3", "dodgerblue3"), lwd = 2,
       text.col = c("gray25", "forestgreen", "firebrick3", "dodgerblue3"))
# dev.off()


Try the shrink package in your browser

Any scripts or data that you put into this service are public.

shrink documentation built on Nov. 2, 2023, 6:05 p.m.