Nothing
## ----setup, cache=FALSE, echo=FALSE-----------------------------------
library(knitr)
options(replace.assign=FALSE,width=72)
opts_chunk$set(fig.path='figs/key-', cache.path='cache/hdd-',
fig.align='center', dev='pdf', fig.width=3.75,
fig.height=4, out.width="0.47\\textwidth",
fig.show='hold', par=TRUE,
tidy=FALSE, comment=NA)
knit_hooks$set(par=function(before, options, envir){
if (before && options$fig.show!='none') par(mar=c(4,4,1.6,.1),
cex.lab=.95,cex.axis=.9,mgp=c(2,.7,0),tcl=-.3)
}, crop=hook_pdfcrop)
pdf.options(pointsize=12)
oldopt <- options(digits=4)
## ----gpsOFinterest----------------------------------------------------
library(hddplot)
data(golubInfo)
with(golubInfo, table(cancer, tissue.mf))
## ----identify-samples-------------------------------------------------
attach(golubInfo)
## Identify allB samples for that are BM:f or BM:m or PB:m
subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m")
## Form vector that identifies these as BM:f or BM:m or PB:m
tissue.mfB <- tissue.mf[subsetB, drop=TRUE]
## Separate off the relevant columns of the matrix Golub
data(Golub) # NB: variables (rows) by cases (columns)
GolubB <- Golub[,subsetB]
detach(golubInfo)
## ----Boxplots, echo=FALSE---------------------------------------------
## Display distributions for the first 20 observations
boxplot(data.frame(GolubB[, 1:20])) # First 20 columns (observations)
## Random selection of 20 rows (features)
boxplot(data.frame(GolubB[sample(1:7129, 20), ]))
## ----Boxplots, eval=FALSE---------------------------------------------
# ## Display distributions for the first 20 observations
# boxplot(data.frame(GolubB[, 1:20])) # First 20 columns (observations)
# ## Random selection of 20 rows (features)
# boxplot(data.frame(GolubB[sample(1:7129, 20), ]))
## ----Flawed-scores, eval=TRUE, echo=FALSE-----------------------------
## Uses orderFeatures() (hddplot); see below
ord15 <- orderFeatures(GolubB, cl=tissue.mfB)[1:15]
## Panel A
dfB.ord <- data.frame(t(GolubB[ord15, ]))
## Calculations for the left panel
## Transpose to observations by features
dfB15 <- data.frame(t(GolubB[ord15, ]))
library(MASS)
dfB15.lda <- lda(dfB15, grouping=tissue.mfB)
scores <- predict(dfB15.lda, dimen=2)$x
## Scores for the single PB:f observation
df.PBf <- with(golubInfo,
data.frame(t(Golub[ord15, tissue.mf=="PB:f" & cancer=="allB",
drop=FALSE])))
scores.PBf <- predict(dfB15.lda, newdata=df.PBf, dimen=2)$x
## For comparison: simulated scores
simscores <- simulateScores(nrow=7129, cl=rep(1:3, c(19,10,2)),
cl.other=4, nfeatures=15, seed=41)
# Returns list elements: scores, cl, scores.other & cl.other
## ----misleading-plots, echo=FALSE-------------------------------------
opar <- par(mar=c(4,4,2.6,.1))
## Warning! The plot that now follows may be misleading!
## Use scoreplot(), from the hddplot package
scoreplot(list(scores=scores, cl=tissue.mfB, other=scores.PBf,
cl.other="PB:f"))
## Panel B: Repeat plot, now with random normal data
scoreplot(simscores)
par(opar)
## ----Flawed-scores, eval=FALSE, echo=TRUE-----------------------------
# ## Uses orderFeatures() (hddplot); see below
# ord15 <- orderFeatures(GolubB, cl=tissue.mfB)[1:15]
# ## Panel A
# dfB.ord <- data.frame(t(GolubB[ord15, ]))
# ## Calculations for the left panel
# ## Transpose to observations by features
# dfB15 <- data.frame(t(GolubB[ord15, ]))
# library(MASS)
# dfB15.lda <- lda(dfB15, grouping=tissue.mfB)
# scores <- predict(dfB15.lda, dimen=2)$x
# ## Scores for the single PB:f observation
# df.PBf <- with(golubInfo,
# data.frame(t(Golub[ord15, tissue.mf=="PB:f" & cancer=="allB",
# drop=FALSE])))
# scores.PBf <- predict(dfB15.lda, newdata=df.PBf, dimen=2)$x
# ## For comparison: simulated scores
# simscores <- simulateScores(nrow=7129, cl=rep(1:3, c(19,10,2)),
# cl.other=4, nfeatures=15, seed=41)
# # Returns list elements: scores, cl, scores.other & cl.other
## ----misleading-plots, eval=FALSE-------------------------------------
# opar <- par(mar=c(4,4,2.6,.1))
# ## Warning! The plot that now follows may be misleading!
# ## Use scoreplot(), from the hddplot package
# scoreplot(list(scores=scores, cl=tissue.mfB, other=scores.PBf,
# cl.other="PB:f"))
# ## Panel B: Repeat plot, now with random normal data
# scoreplot(simscores)
# par(opar)
## ----F-stats, results="hide", warning=FALSE, message=FALSE------------
## In the following, B is too small for the simulation to
## give a good indication of behavior in the extreme tail.
## Code should be re-run with B >> 1000.
library(multtest, quietly=TRUE)
cl.mfB <- unname(unclass(tissue.mfB)-1)
## NB: Use unnamed integer vector to identify classes,
## here with values running from 0 to 2
GolubB.maxT <- mt.maxT(GolubB, cl.mfB, test="f", B=1000)
## ----qq-Fstats, eval=FALSE--------------------------------------------
# ## Compare calculated F-statistics with permutation distribution
# qqthin(qf(1-GolubB.maxT$rawp, 2, 28), GolubB.maxT$teststat,
# print.thinning.details = FALSE)
# ## Compare calculated F-statistics with theoretical F-distribution
# qqthin(qf(ppoints(7129), 2, 28), GolubB.maxT$teststat,
# print.thinning.details = FALSE)
# # The theoretical F-distribution gives estimates of quantiles
# # that are too small
# ## NB also the comparison between the permutation distribution
# ## and the theoretical F:
# qqthin(qf(ppoints(7129), 2, 28), qf(1-GolubB.maxT$rawp, 2, 28),
# print.thinning.details = FALSE)
# # qqthin() is a version of qqplot() that thins out points where
# # overlap is substantial, thus giving smaller graphics files.
## ----plot-Fstats, echo=FALSE, fig.width=3, fig.height=3.25, out.width="0.32\\textwidth"----
## Compare calculated F-statistics with permutation distribution
qqthin(qf(1-GolubB.maxT$rawp, 2, 28), GolubB.maxT$teststat,
print.thinning.details = FALSE)
## Compare calculated F-statistics with theoretical F-distribution
qqthin(qf(ppoints(7129), 2, 28), GolubB.maxT$teststat,
print.thinning.details = FALSE)
# The theoretical F-distribution gives estimates of quantiles
# that are too small
## NB also the comparison between the permutation distribution
## and the theoretical F:
qqthin(qf(ppoints(7129), 2, 28), qf(1-GolubB.maxT$rawp, 2, 28),
print.thinning.details = FALSE)
# qqthin() is a version of qqplot() that thins out points where
# overlap is substantial, thus giving smaller graphics files.
## ----F-stats, eval=FALSE----------------------------------------------
# ## In the following, B is too small for the simulation to
# ## give a good indication of behavior in the extreme tail.
# ## Code should be re-run with B >> 1000.
# library(multtest, quietly=TRUE)
# cl.mfB <- unname(unclass(tissue.mfB)-1)
# ## NB: Use unnamed integer vector to identify classes,
# ## here with values running from 0 to 2
# GolubB.maxT <- mt.maxT(GolubB, cl.mfB, test="f", B=1000)
## ----qq-Fstats, eval=FALSE--------------------------------------------
# ## Compare calculated F-statistics with permutation distribution
# qqthin(qf(1-GolubB.maxT$rawp, 2, 28), GolubB.maxT$teststat,
# print.thinning.details = FALSE)
# ## Compare calculated F-statistics with theoretical F-distribution
# qqthin(qf(ppoints(7129), 2, 28), GolubB.maxT$teststat,
# print.thinning.details = FALSE)
# # The theoretical F-distribution gives estimates of quantiles
# # that are too small
# ## NB also the comparison between the permutation distribution
# ## and the theoretical F:
# qqthin(qf(ppoints(7129), 2, 28), qf(1-GolubB.maxT$rawp, 2, 28),
# print.thinning.details = FALSE)
# # qqthin() is a version of qqplot() that thins out points where
# # overlap is substantial, thus giving smaller graphics files.
## ----selection-4lda---------------------------------------------------
## Selection of features that discriminate
## ss 12.3.3: Accuracies and Scores for test data
Golub.BM <- with(golubInfo, Golub[, BM.PB=="BM"])
cancer.BM <- with(golubInfo, cancer[BM.PB=="BM"])
## Now split each of the cancer.BM categories between two subsets
## Uses divideUp(), from hddplot
gp.id <- divideUp(cancer.BM, nset=2, seed=29)
# Set seed to allow exact reproduction of the results below
table(gp.id, cancer.BM)
## ----train-test-------------------------------------------------------
accboth <- accTrainTest(x = Golub.BM, cl = cancer.BM,
traintest=gp.id, , print.progress=FALSE)
## ----plot-train-test, fig.width=7.5, fig.height=4, out.width="0.97\\textwidth", echo=FALSE----
opar <- par(mar=c(4,4,3.1,.1))
## Use function plotTrainTest() from hddplot
plotTrainTest(x=Golub.BM, nfeatures=c(14,10), cl=cancer.BM, traintest=gp.id)
par(opar)
## ----plot-train-test, eval=FALSE--------------------------------------
# opar <- par(mar=c(4,4,3.1,.1))
# ## Use function plotTrainTest() from hddplot
# plotTrainTest(x=Golub.BM, nfeatures=c(14,10), cl=cancer.BM, traintest=gp.id)
# par(opar)
## ----match------------------------------------------------------------
rbind(accboth$sub1.2[1:20],accboth$sub2.1[1:20])
match(accboth$sub1.2[1:20],accboth$sub2.1[1:20])
## ----opt-tissue-mfB-cv, warning=FALSE---------------------------------
## Cross-validation to determine the optimum number of features
## Accuracy measure will be: tissue.mfB.cv$acc.cv
tissue.mfB.cv <- cvdisc(GolubB, cl=tissue.mfB, nfeatures=1:23,
nfold=c(5,1), print.progress=FALSE)
# 5-fold CV (x1)
## Defective measures will be in acc.resub (resubstitution)
## and acc.sel1 (select features prior to cross-validation)
tissue.mfB.badcv <- defectiveCVdisc(GolubB, cl=tissue.mfB,
foldids=tissue.mfB.cv$folds,
nfeatures=1:23, nfold=c(5,1),
print.progress=FALSE)
## NB: Warning messages have been omitted
## ----tissue-random, warning=FALSE-------------------------------------
## Calculations for random normal data:
set.seed(43)
rGolubB <- matrix(rnorm(prod(dim(GolubB))), nrow=dim(GolubB)[1])
rtissue.mfB.cv <- cvdisc(rGolubB, cl=tissue.mfB, nfeatures=1:23,
nfold=c(5,1), print.progress=FALSE)
rtissue.mfB.badcv <- defectiveCVdisc(rGolubB, cl=tissue.mfB,
nfeatures=1:23, nfold=c(5,1),
foldids=rtissue.mfB.cv$folds,
print.progress=FALSE)
## ----plot-acc---------------------------------------------------------
## This function will be used for the plots
plot.acc <- function(cv=cv1, badcv=badcv1, nseq=NULL, badnseq=NULL,
title="", ylab="Predictive accuracy",
add.legend=TRUE){
maxg <- min(c(length(badcv$acc.resub), length(cv$acc.cv)))
if(is.null(nseq))nseq <- 1:maxg
plot(nseq, badcv$acc.resub[1:maxg], ylim=c(0,1), type="n",
yaxs="i", xlab="Number of features selected", ylab=ylab)
par(xpd=T)
points(nseq, badcv$acc.resub[1:maxg], col=2, type="b", lty=2,
pch=0, cex=0.8)
par(xpd=FALSE)
points(nseq, badcv$acc.sel1[1:maxg], col="gray40", pch=3, cex=0.8)
lines(lowess(nseq, badcv$acc.sel1[1:maxg], f=.325, iter=0),
col="gray40", lty=2)
points(nseq, cv$acc.cv[1:maxg], col="blue", pch=1, cex=0.8)
lines(lowess(nseq, cv$acc.cv[1:maxg], f=.325, iter=0), col="blue",
lwd=2)
xy <- par()$usr[c(1,3)]
if(add.legend)
legend(xy[1], xy[2], xjust=0, yjust=0,
legend=c("Training set 'accuracy'",
"Defective cross-validation",
"Cross-validation - select at each fold"),
lty=c(1,2,1), lwd=c(1,1,2), pch=c(0,3,1),
col=c("red","gray40","blue"), cex=0.875)
mtext(side=3,line=0.35, title, adj=0)
}
## ----cv-bad-----------------------------------------------------------
plot.acc(tissue.mfB.cv, tissue.mfB.badcv,
title="A: Golub data (as for Figure 12.9)")
plot.acc(rtissue.mfB.cv, rtissue.mfB.badcv, ylab="",
title="B: Random data", add.legend=FALSE)
## ----which------------------------------------------------------------
## Which features?
genelist <- matrix(tissue.mfB.cv$genelist[1:3, ,], nrow=3)
tab <- table(genelist, row(genelist))
ord <- order(tab[,1], tab[,2], decreasing=TRUE)
tab[ord,]
## ----cv-BMsamples-----------------------------------------------------
## Cross-validation: bone marrow ({BM}) samples only
BMonly.cv <- cvdisc(Golub.BM, cl=cancer.BM, nfeatures=1:25,
nfold=c(5,1), print.progress=FALSE)
tissue.mfB.scores <-
cvscores(cvlist = tissue.mfB.cv, nfeatures = 3, cl.other = NULL,
print.progress=FALSE)
BMonly.scores <- cvscores(cvlist=BMonly.cv, nfeatures=19,
cl.other=NULL, print.progress=FALSE)
## ----cv-Bcell-gphAB, echo=FALSE---------------------------------------
opar <- par(mar=c(4,4,2.6,.1))
## Panel A: Uses tissue.mfB.acc from above
scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL,
prefix="B-cell subset -")
## Panel B; classify bone marrow samples a/c cancer type.
scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB,
circle=tissue.mfB%in%c("BM:f","BM:m"),
params=list(circle=list(col=c("cyan","gray"))),
prefix="B: BM samples -")
par(opar)
## ----cv-Bcell-gphAB, eval=FALSE---------------------------------------
# opar <- par(mar=c(4,4,2.6,.1))
# ## Panel A: Uses tissue.mfB.acc from above
# scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL,
# prefix="B-cell subset -")
# ## Panel B; classify bone marrow samples a/c cancer type.
# scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB,
# circle=tissue.mfB%in%c("BM:f","BM:m"),
# params=list(circle=list(col=c("cyan","gray"))),
# prefix="B: BM samples -")
# par(opar)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.