inst/doc/sda-khan-data.R

# /*
# This is an R script containing R markdown comments.  It can be run as is in R.
# To generate a document containing the formatted R code, R output and markdown 
# click the "Compile Notebook" button in R Studio, or run the command
# rmarkdown::render() - see http://rmarkdown.rstudio.com/r_notebook_format.html
# */


#' ---
#' title: "Analysis of SRBCT Data"
#' output: pdf_document
#' author: ""
#' date: Requires "sda" in version 1.3.2 (January 2014) or later
#' ---

#'
#' # Load "sda" package and create SRBCT data set

library("sda")

#' Load data set from Khan et al. (2001):
data(khan2001)

#' Create data set containing only the SRBCT samples:
del.idx = which( khan2001$y == "non-SRBCT" )
srbct.x = khan2001$x[-del.idx,]
srbct.y = factor(khan2001$y[-del.idx])
dim(srbct.x)
 
#' Four subtypes of cancer:
levels(srbct.y)

#' Divide into training and test data
Xtrain = srbct.x[1:63,]
Ytrain = srbct.y[1:63]
Xtest = srbct.x[64:83,]
Ytest = srbct.y[64:83]


#' \newpage
#'
#' # Diagonal Discriminant Analysis (DDA) 

#' In DDA correlation among predictors is assumed to be zero, i.e. a diagonal 
#' covariance matrix is used.
 
 
#'
#' ## Step 1 - feature ranking

#' As there are more than two groups in the response there are three different
#' ways to obtain a summary test statistic to rank genes:

#' a) ranking by averaged squared t-scores across the four groups
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, diagonal=TRUE, ranking.score="avg")
sum( ra[, "lfdr"]< 0.80)  # 97 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  145 genes according to HC criterion

#' b) ranking by maximum of squared t-scores across the four groups
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, diagonal=TRUE, ranking.score="max")
sum( ra[, "lfdr"]< 0.80)  # 78 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  121 genes according to HC criterion

#' c) ranking by mutual information (weighted sum of squared t-scores)
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, diagonal=TRUE, ranking.score="entropy")
sum( ra[, "lfdr"]< 0.80)  # 99 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  158 genes according to HC criterion


#' here we pick the top 99 genes of option c)
#+ fig.height=8
plot(ra, top=99, main="The 99 Top Ranking Genes", ylab="Gene ID")

#' Select these 99 variables:
idx = ra[1:99,"idx"]
Xtrain2 = Xtrain[,idx]
Xtest2 = Xtest[,idx]

 
#'
#' ## Step 2 - training the classifier

#' Learn DDA predictor:
sda.fit = sda(Xtrain2, Ytrain, diagonal=TRUE)

#'
#' ## Step 3 - prediction

#' Predict class labels from test data and compare with known labels:
dim(Xtest2)
predict(sda.fit, Xtest2) 
ynew = predict(sda.fit, Xtest2)$class

#' Number of missclassified test samples:
sum(ynew != Ytest)
# /* 1 */


#' \newpage
#'
#' # Linear Discriminant Analysis (LDA) 

#' In LDA correlation among predictors is taken into account.
 
#'
#' ## Step 1 - feature ranking

#' As there are more than two groups in the response there are three different
#' ways to obtain a summary test statistic to rank genes:
 
#' a) ranking by averaged squared cat-scores across the four groups
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, ranking.score="avg")
sum( ra[, "lfdr"]< 0.80)  # 93 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  143 genes according to HC criterion
 
#' b) ranking by maximum of squared cat-scores across the four groups
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, ranking.score="max")
sum( ra[, "lfdr"]< 0.80)  # 156 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  194 genes according to HC criterion
 
#' c) ranking by mutual information (weighted sum of squared cat-scores)
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, ranking.score="entropy")
sum( ra[, "lfdr"]< 0.80)  # 97 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  140 genes according to HC criterion
 
#' here we pick the top 97 genes of option c)
#+ fig.height=8
plot(ra, top=97, main="The 97 Top Ranking Genes", ylab="Gene ID")

#' Select these 97 variables:
idx = ra[1:97,"idx"]
Xtrain2 = Xtrain[,idx]
Xtest2 = Xtest[,idx]
 
#'
#' ## Step 2 - training the classifier

#' Learn LDA predictor:
sda.fit = sda(Xtrain2, Ytrain)

#'
#' ## Step 3 - prediction
 
#' Predict class labels from test data and compare with known labels:
dim(Xtest2)
predict(sda.fit, Xtest2)
ynew = predict(sda.fit, Xtest2)$class

#' Number of missclassified test samples:
sum(ynew != Ytest) 
# /* 0 */

 
#' \newpage
#'
#' # Estimate prediction accuracy using crossvalidation 
 
#' Using crossvalidation we can estimate the prediction error
#' from the training data set alone.
 
library("crossval")
  
 
#' Setup prediction function: estimate the accuracy of a predictor with a fixed number of predictors (note
#' this takes into account the uncertainty in estimating the variable ordering).
predfun = function(Xtrain, Ytrain, Xtest, Ytest, numVars, diagonal=FALSE,
                    ranking.score="entropy")
{ 
  # estimate ranking and determine the best numVars variables
  ra = sda.ranking(Xtrain, Ytrain, verbose=FALSE, diagonal=diagonal, 
                   fdr=FALSE, ranking.score=ranking.score)
  selVars = ra[,"idx"][1:numVars]
   
  # fit and predict
  sda.out = sda(Xtrain[, selVars, drop=FALSE], Ytrain, diagonal=diagonal,
                verbose=FALSE)
  ynew = predict(sda.out, Xtest[, selVars, drop=FALSE], verbose=FALSE)$class 
   
  # compute accuracy
  acc = mean(Ytest == ynew)
   
  return(acc)  
}
 
#' Our setup for crossvalidation:
K = 10 # number of folds
B = 20 # number of repetitions
 
 
#'  Crossvalidation estimate of accuracy for
#'  LDA using the top 100 features ranked by CAT scores
#'  (combined across groups using "entropy" for overall ranking):
set.seed(12345)
cv.lda100 = crossval(predfun, Xtrain, Ytrain, K=K, B=B, numVars=100, 
                     diagonal=FALSE, verbose=FALSE)
cv.lda100$stat
# /* 1 */
 
 
#'
#' ## Comparison of LDA / DDA and "entropy" and "max" options
 
#' LDA using the top 10 features ranked by CAT scores
#' (combined across groups using "entropy" for overall ranking):
set.seed(12345)
cv.lda10 = crossval(predfun, Xtrain, Ytrain, K=K, B=B, numVars=10, 
                     diagonal=FALSE, verbose=FALSE)
cv.lda10$stat
# /* 0.9909762 */
 
#' DDA using the top 10 features ranked by t scores
#' (combined across groups using "entropy" for overall ranking):
set.seed(12345)
cv.dda10 = crossval(predfun, Xtrain, Ytrain, K=K, B=B, numVars=10, 
                     diagonal=TRUE, verbose=FALSE)
cv.dda10$stat
# /* 0.9643869 */
 
#' DDA using the top 10 features ranked by t scores,
#' (combined across groups using "max" for overall ranking, as in PAM):
set.seed(12345)
cv.dda10b = crossval(predfun, Xtrain, Ytrain, K=K, B=B, numVars=10, 
                      diagonal=TRUE, ranking.score="max", verbose=FALSE)
cv.dda10b$stat
# /* 0.9585595 */
 
#' **Conclusions**:
#' 
#' 1. LDA/CAT score ranking performs petter than DDA/t-score ranking.
#' 2. "entropy" is better as group summary than "max".

Try the sda package in your browser

Any scripts or data that you put into this service are public.

sda documentation built on May 29, 2017, 5:29 p.m.