sda-khan-data.R
In sda: Shrinkage Discriminant Analysis and CAT Score Variable Selection

# /*
# This is an R script containing R markdown comments.  It can be run as is in R.
# To generate a document containing the formatted R code, R output and markdown 
# click the "Compile Notebook" button in R Studio, or run the command
# rmarkdown::render() - see http://rmarkdown.rstudio.com/r_notebook_format.html
# */


#' ---
#' title: "Analysis of SRBCT Data"
#' output: pdf_document
#' author: ""
#' date: Requires "sda" in version 1.3.2 (January 2014) or later
#' ---

#'
#' # Load "sda" package and create SRBCT data set

library("sda")

#' Load data set from Khan et al. (2001):
data(khan2001)

#' Create data set containing only the SRBCT samples:
del.idx = which( khan2001$y == "non-SRBCT" )
srbct.x = khan2001$x[-del.idx,]
srbct.y = factor(khan2001$y[-del.idx])
dim(srbct.x)
 
#' Four subtypes of cancer:
levels(srbct.y)

#' Divide into training and test data
Xtrain = srbct.x[1:63,]
Ytrain = srbct.y[1:63]
Xtest = srbct.x[64:83,]
Ytest = srbct.y[64:83]


#' \newpage
#'
#' # Diagonal Discriminant Analysis (DDA) 

#' In DDA correlation among predictors is assumed to be zero, i.e. a diagonal 
#' covariance matrix is used.
 
 
#'
#' ## Step 1 - feature ranking

#' As there are more than two groups in the response there are three different
#' ways to obtain a summary test statistic to rank genes:

#' a) ranking by averaged squared t-scores across the four groups
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, diagonal=TRUE, ranking.score="avg")
sum( ra[, "lfdr"]< 0.80)  # 97 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  145 genes according to HC criterion

#' b) ranking by maximum of squared t-scores across the four groups
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, diagonal=TRUE, ranking.score="max")
sum( ra[, "lfdr"]< 0.80)  # 78 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  121 genes according to HC criterion

#' c) ranking by mutual information (weighted sum of squared t-scores)
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, diagonal=TRUE, ranking.score="entropy")
sum( ra[, "lfdr"]< 0.80)  # 99 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  158 genes according to HC criterion


#' here we pick the top 99 genes of option c)
#+ fig.height=8
plot(ra, top=99, main="The 99 Top Ranking Genes", ylab="Gene ID")

#' Select these 99 variables:
idx = ra[1:99,"idx"]
Xtrain2 = Xtrain[,idx]
Xtest2 = Xtest[,idx]

 
#'
#' ## Step 2 - training the classifier

#' Learn DDA predictor:
sda.fit = sda(Xtrain2, Ytrain, diagonal=TRUE)

#'
#' ## Step 3 - prediction

#' Predict class labels from test data and compare with known labels:
dim(Xtest2)
predict(sda.fit, Xtest2) 
ynew = predict(sda.fit, Xtest2)$class

#' Number of missclassified test samples:
sum(ynew != Ytest)
# /* 1 */


#' \newpage
#'
#' # Linear Discriminant Analysis (LDA) 

#' In LDA correlation among predictors is taken into account.
 
#'
#' ## Step 1 - feature ranking

#' As there are more than two groups in the response there are three different
#' ways to obtain a summary test statistic to rank genes:
 
#' a) ranking by averaged squared cat-scores across the four groups
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, ranking.score="avg")
sum( ra[, "lfdr"]< 0.80)  # 93 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  143 genes according to HC criterion
 
#' b) ranking by maximum of squared cat-scores across the four groups
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, ranking.score="max")
sum( ra[, "lfdr"]< 0.80)  # 156 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  194 genes according to HC criterion
 
#' c) ranking by mutual information (weighted sum of squared cat-scores)
#+ fig.height=5
ra = sda.ranking(Xtrain, Ytrain, fdr=TRUE, plot.fdr=TRUE, ranking.score="entropy")
sum( ra[, "lfdr"]< 0.80)  # 97 genes included in classifier (by FNDR control)
which.max( ra[, "HC"] )  #  140 genes according to HC criterion
 
#' here we pick the top 97 genes of option c)
#+ fig.height=8
plot(ra, top=97, main="The 97 Top Ranking Genes", ylab="Gene ID")

#' Select these 97 variables:
idx = ra[1:97,"idx"]
Xtrain2 = Xtrain[,idx]
Xtest2 = Xtest[,idx]
 
#'
#' ## Step 2 - training the classifier

#' Learn LDA predictor:
sda.fit = sda(Xtrain2, Ytrain)

#'
#' ## Step 3 - prediction
 
#' Predict class labels from test data and compare with known labels:
dim(Xtest2)
predict(sda.fit, Xtest2)
ynew = predict(sda.fit, Xtest2)$class

#' Number of missclassified test samples:
sum(ynew != Ytest) 
# /* 0 */

 
#' \newpage
#'
#' # Estimate prediction accuracy using crossvalidation 
 
#' Using crossvalidation we can estimate the prediction error
#' from the training data set alone.
 
library("crossval")
  
 
#' Setup prediction function: estimate the accuracy of a predictor with a fixed number of predictors (note
#' this takes into account the uncertainty in estimating the variable ordering).
predfun = function(Xtrain, Ytrain, Xtest, Ytest, numVars, diagonal=FALSE,
                    ranking.score="entropy")
{ 
  # estimate ranking and determine the best numVars variables
  ra = sda.ranking(Xtrain, Ytrain, verbose=FALSE, diagonal=diagonal, 
                   fdr=FALSE, ranking.score=ranking.score)
  selVars = ra[,"idx"][1:numVars]
   
  # fit and predict
  sda.out = sda(Xtrain[, selVars, drop=FALSE], Ytrain, diagonal=diagonal,
                verbose=FALSE)
  ynew = predict(sda.out, Xtest[, selVars, drop=FALSE], verbose=FALSE)$class 
   
  # compute accuracy
  acc = mean(Ytest == ynew)
   
  return(acc)  
}
 
#' Our setup for crossvalidation:
K = 10 # number of folds
B = 20 # number of repetitions
 
 
#'  Crossvalidation estimate of accuracy for
#'  LDA using the top 100 features ranked by CAT scores
#'  (combined across groups using "entropy" for overall ranking):
set.seed(12345)
cv.lda100 = crossval(predfun, Xtrain, Ytrain, K=K, B=B, numVars=100, 
                     diagonal=FALSE, verbose=FALSE)
cv.lda100$stat
# /* 1 */
 
 
#'
#' ## Comparison of LDA / DDA and "entropy" and "max" options
 
#' LDA using the top 10 features ranked by CAT scores
#' (combined across groups using "entropy" for overall ranking):
set.seed(12345)
cv.lda10 = crossval(predfun, Xtrain, Ytrain, K=K, B=B, numVars=10, 
                     diagonal=FALSE, verbose=FALSE)
cv.lda10$stat
# /* 0.9909762 */
 
#' DDA using the top 10 features ranked by t scores
#' (combined across groups using "entropy" for overall ranking):
set.seed(12345)
cv.dda10 = crossval(predfun, Xtrain, Ytrain, K=K, B=B, numVars=10, 
                     diagonal=TRUE, verbose=FALSE)
cv.dda10$stat
# /* 0.9643869 */
 
#' DDA using the top 10 features ranked by t scores,
#' (combined across groups using "max" for overall ranking, as in PAM):
set.seed(12345)
cv.dda10b = crossval(predfun, Xtrain, Ytrain, K=K, B=B, numVars=10, 
                      diagonal=TRUE, ranking.score="max", verbose=FALSE)
cv.dda10b$stat
# /* 0.9585595 */
 
#' **Conclusions**:
#' 
#' 1. LDA/CAT score ranking performs petter than DDA/t-score ranking.
#' 2. "entropy" is better as group summary than "max".

Any scripts or data that you put into this service are public.

sda documentation built on April 11, 2025, 6:16 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

sda
Shrinkage Discriminant Analysis and CAT Score Variable Selection

inst/doc/sda-khan-data.R
In sda: Shrinkage Discriminant Analysis and CAT Score Variable Selection

Try the sda package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sda Shrinkage Discriminant Analysis and CAT Score Variable Selection

inst/doc/sda-khan-data.R In sda: Shrinkage Discriminant Analysis and CAT Score Variable Selection

Try the sda package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sda
Shrinkage Discriminant Analysis and CAT Score Variable Selection

inst/doc/sda-khan-data.R
In sda: Shrinkage Discriminant Analysis and CAT Score Variable Selection