MLSeq.R
In MLSeq: Machine Learning Interface for RNA-Seq Data

## ----style-knitr, eval=TRUE, echo=FALSE, results='asis'-----------------------
  BiocStyle::latex(use.unsrturl = FALSE, width = 80)

## ----knitr_options, echo=FALSE, results="hide", warning=FALSE-----------------
library(knitr)
opts_chunk$set(tidy = FALSE, dev = "pdf", fig.show = "hide", message = FALSE, fig.align = "center", cache = FALSE)

## ----load_packages, echo=FALSE, results="hide", warning=FALSE-----------------
library(MLSeq)
library(DESeq2)
library(edgeR)
library(VennDiagram)
library(pamr)

## ----eval = FALSE-------------------------------------------------------------
#  library(MLSeq)

## ----file_path_cervical-------------------------------------------------------
filepath <- system.file("extdata/cervical.txt", package = "MLSeq")

## ----read_cervical_data-------------------------------------------------------
cervical <- read.table(filepath, header=TRUE)

## ----head_cervical------------------------------------------------------------
head(cervical[ ,1:10]) # Mapped counts for first 6 features of 10 subjects.

## ----define_class_labels------------------------------------------------------
class <- DataFrame(condition = factor(rep(c("N","T"), c(29, 29))))
class

## ----data_splitting-----------------------------------------------------------
library(DESeq2)

set.seed(2128)

# We do not perform a differential expression analysis to select differentially
# expressed genes. However, in practice, DE analysis might be performed before
# fitting classifiers. Here, we selected top 100 features having the highest
# gene-wise variances in order to decrease computational cost.
vars <- sort(apply(cervical, 1, var, na.rm = TRUE), decreasing = TRUE)
data <- cervical[names(vars)[1:100], ]
nTest <- ceiling(ncol(data) * 0.3)
ind <- sample(ncol(data), nTest, FALSE)

# Minimum count is set to 1 in order to prevent 0 division problem within
# classification models.
data.train <- as.matrix(data[ ,-ind] + 1)
data.test <- as.matrix(data[ ,ind] + 1)
classtr <- DataFrame(condition = class[-ind, ])
classts <- DataFrame(condition = class[ind, ])

## ----DESeqDataSets------------------------------------------------------------
data.trainS4 = DESeqDataSetFromMatrix(countData = data.train, colData = classtr,
                                      design = formula(~condition))
data.testS4 = DESeqDataSetFromMatrix(countData = data.test, colData = classts,
                                     design = formula(~condition))

## ----mwe_limitations_on_continuous_classifiers, eval = FALSE, message=FALSE----
#  # Support Vector Machines with Radial Kernel
#  fit <- classify(data = data.trainS4, method = "svmRadial",
#                   preProcessing = "deseq-rlog", ref = "T",
#                   control = trainControl(method = "repeatedcv", number = 2,
#                                          repeats = 2, classProbs = TRUE))
#  show(fit)

## ----eval = FALSE, echo = TRUE------------------------------------------------
#  set.seed(2128)
#  
#  # Voom based Nearest Shrunken Centroids.
#  fit <- classify(data = data.trainS4, method = "voomNSC",
#                   normalize = "deseq", ref = "T",
#                   control = voomControl(tuneLength = 20))
#  
#  trained(fit)  ## Trained model summary

## ----Optimizing_model_parameters_example, eval = TRUE, echo = TRUE------------
set.seed(2128)

# Support vector machines with radial basis function kernel
fit.svm <- classify(data = data.trainS4, method = "svmRadial",
                 preProcessing = "deseq-vst", ref = "T", tuneLength = 10,
                 control = trainControl(method = "repeatedcv", number = 5,
                                        repeats = 10, classProbs = TRUE))

show(fit.svm)

## ----fitted_model_svm---------------------------------------------------------
trained(fit.svm)

## ----eval = FALSE-------------------------------------------------------------
#  plot(fit.svm)

## ----fitted_model_svm_figure, echo = FALSE, results='hide'--------------------
cairo_pdf(filename = "fitted_model_svm_figure.pdf", height = 5.5)
plot(fit.svm)
dev.off()

## ----control_svm_model_example, eval = FALSE----------------------------------
#  # Define control list
#  ctrl.svm <- trainControl(method = "repeatedcv", number = 5, repeats = 1)
#  ctrl.plda <- discreteControl(method = "repeatedcv", number = 5, repeats = 1,
#                               tuneLength = 10)
#  ctrl.voomDLDA <- voomControl(method = "repeatedcv", number = 5, repeats = 1,
#                               tuneLength = 10)
#  
#  # Support vector machines with radial basis function kernel
#  fit.svm <- classify(data = data.trainS4, method = "svmRadial",
#                   preProcessing = "deseq-vst", ref = "T", tuneLength = 10,
#                   control = ctrl.svm)
#  
#  # Poisson linear discriminant analysis
#  fit.plda <- classify(data = data.trainS4, method = "PLDA", normalize = "deseq",
#                       ref = "T", control = ctrl.plda)
#  
#  # Voom-based diagonal linear discriminant analysis
#  fit.voomDLDA <- classify(data = data.trainS4, method = "voomDLDA",
#                           normalize = "deseq", ref = "T", control = ctrl.voomDLDA)

## ----echo = FALSE-------------------------------------------------------------
# Define control list
ctrl.voomDLDA <- voomControl(method = "repeatedcv", number = 5, repeats = 1,
                             tuneLength = 10)

# Voom-based diagonal linear discriminant analysis
fit.voomDLDA <- classify(data = data.trainS4, method = "voomDLDA",
                         normalize = "deseq", ref = "T", control = ctrl.voomDLDA)

## -----------------------------------------------------------------------------
trained(fit.voomDLDA)

## -----------------------------------------------------------------------------
#Predicted class labels
pred.svm <- predict(fit.svm, data.testS4)
pred.svm

## -----------------------------------------------------------------------------
pred.svm <- relevel(pred.svm, ref = "T")
actual <- relevel(classts$condition, ref = "T")

tbl <- table(Predicted = pred.svm, Actual = actual)
confusionMatrix(tbl, positive = "T")

## ----results='hide', message=FALSE--------------------------------------------
set.seed(2128)

# Define control lists.
ctrl.continuous <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
ctrl.discrete <- discreteControl(method = "repeatedcv", number = 5, repeats = 10,
                             tuneLength = 10)
ctrl.voom <- voomControl(method = "repeatedcv", number = 5, repeats = 10,
                             tuneLength = 10)

# 1. Continuous classifiers, SVM and NSC
fit.svm <- classify(data = data.trainS4, method = "svmRadial",
                 preProcessing = "deseq-vst", ref = "T", tuneLength = 10,
                 control = ctrl.continuous)

fit.NSC <- classify(data = data.trainS4, method = "pam",
                 preProcessing = "deseq-vst", ref = "T", tuneLength = 10,
                 control = ctrl.continuous)

# 2. Discrete classifiers
fit.plda <- classify(data = data.trainS4, method = "PLDA", normalize = "deseq",
                     ref = "T", control = ctrl.discrete)

fit.plda2 <- classify(data = data.trainS4, method = "PLDA2", normalize = "deseq",
                     ref = "T", control = ctrl.discrete)

fit.nblda <- classify(data = data.trainS4, method = "NBLDA", normalize = "deseq",
                     ref = "T", control = ctrl.discrete)

# 3. voom-based classifiers
fit.voomDLDA <- classify(data = data.trainS4, method = "voomDLDA",
                         normalize = "deseq", ref = "T", control = ctrl.voom)

fit.voomNSC <- classify(data = data.trainS4, method = "voomNSC",
                         normalize = "deseq", ref = "T", control = ctrl.voom)

# 4. Predictions
pred.svm <- predict(fit.svm, data.testS4)
pred.NSC <- predict(fit.NSC, data.testS4)
# ... truncated

## ----echo = FALSE, results='asis', message=FALSE------------------------------
library(xtable)

pred.svm <- predict(fit.svm, data.testS4)
pred.NSC <- predict(fit.NSC, data.testS4)
pred.plda <- predict(fit.plda, data.testS4)
pred.nblda <- predict(fit.nblda, data.testS4)
pred.voomDLDA <- predict(fit.voomDLDA, data.testS4)
pred.voomNSC <- predict(fit.voomNSC, data.testS4)

actual <- data.testS4$condition
nn <- length(actual)
diag.svm <- sum(diag(table(pred.svm, actual)))
diag.NSC <- sum(diag(table(pred.NSC, actual)))
diag.plda <- sum(diag(table(pred.plda, actual)))
diag.nblda <- sum(diag(table(pred.nblda, actual)))
diag.voomDLDA <- sum(diag(table(pred.voomDLDA, actual)))
diag.voomNSC <- sum(diag(table(pred.voomNSC, actual)))

acc <- c(diag.svm, diag.NSC, diag.plda, diag.nblda, diag.voomDLDA, diag.voomNSC) / nn
sparsity <- c(NA, trained(fit.NSC)$finalModel$nonzero/nrow(data.testS4),
              length(selectedGenes(fit.plda))/nrow(data.testS4), NA, NA,
              length(selectedGenes(fit.voomNSC))/nrow(data.testS4))

tbl <- data.frame(Classifier = c("SVM", "NSC", "PLDA (Transformed)", "NBLDA", "voomDLDA", "voomNSC"), Accuracy = acc, Sparsity = sparsity)

xtbl <- xtable(tbl, caption = "Classification results for cervical data.", label = "tbl:accRes", align = "lp{4cm}p{2cm}c")

digits(xtbl) <- c(0, 0, 3, 3)
print.xtable(xtbl, caption.placement = "top", include.rownames = FALSE, booktabs = TRUE)

## ----echo = FALSE-------------------------------------------------------------
best_in_accuracy <- as.character(tbl$Classifier[which(acc == max(acc, na.rm = TRUE))])
best_in_acc_text <- paste("\\textbf{", best_in_accuracy, "}", sep = "")

if (length(best_in_accuracy) >= 2){
  best_in_acc_text <- paste(paste(best_in_acc_text[-length(best_in_acc_text)], collapse = ", "), best_in_acc_text[length(best_in_acc_text)], sep = " and ")
}

best_in_sparsity <- as.character(tbl$Classifier[which(sparsity == min(sparsity, na.rm = TRUE))])
best_in_sparsity_text <- paste("\\textbf{", best_in_sparsity, "}", sep = "")

if (length(best_in_sparsity) >= 2){
  best_in_sparsity_text <- paste(paste(best_in_sparsity_text[-length(best_in_sparsity_text)], collapse = ", "), best_in_sparsity_text[length(best_in_sparsity_text)], sep = " and ")
}

## -----------------------------------------------------------------------------
selectedGenes(fit.voomNSC)

## ----all_common_features, echo = FALSE----------------------------------------
pam.final <- trained(fit.NSC)$finalModel   ## 'pamrtrained' object.
geneIdx <- pamr:::pamr.predict(pam.final, pam.final$xData, threshold = pam.final$threshold, type = "nonzero")

genes.pam <- colnames(pam.final$xData)[geneIdx]
genes.plda <- selectedGenes(fit.plda)
genes.plda2 <- selectedGenes(fit.plda2)
genes.vnsc <- selectedGenes(fit.voomNSC)

tmp.list <- list(genes.pam, genes.plda, genes.plda2, genes.vnsc)

nn <- c(length(genes.pam), length(genes.plda), length(genes.plda2), length(genes.vnsc))
ooo <- order(nn, decreasing = TRUE)

tmp.list <- tmp.list[ooo]

common <- tmp.list[[1]]
for (i in 2:(length(tmp.list))){
  tmp2 <- tmp.list[[i]]
  tmp <- common[common %in% tmp2]
  common <- tmp
}

## ----venn_diagram, echo = FALSE-----------------------------------------------
venn.plot <- venn.diagram(
  x = list(voomNSC = genes.vnsc, NSC = genes.pam, PLDA = genes.plda, PLDA2 = genes.plda2),
  height = 1200, width = 1200,
  resolution = 200,
  filename = "Selected_features.png", imagetype = "png",
  col = "black",
  fill = c("khaki1", "skyblue", "tomato3", "darkolivegreen3"),
  alpha = 0.50,
  cat.cex = 1.2,
  cex = 1.5,
  cat.fontface = "bold"
)

## -----------------------------------------------------------------------------
set.seed(2128)

ctrl <- discreteControl(method = "repeatedcv", number = 5, repeats = 2,
                        tuneLength = 10)

# PLDA without power transformation
fit <- classify(data = data.trainS4, method = "PLDA", normalize = "deseq",
                ref = "T", control = ctrl)
show(fit)

## -----------------------------------------------------------------------------
method(fit) <- "PLDA2"
show(fit)

## -----------------------------------------------------------------------------
ref(fit) <- "N"
normalization(fit) <- "TMM"
metaData(fit)

## -----------------------------------------------------------------------------
fit <- update(fit)
show(fit)

## ----echo = TRUE, message=FALSE, error=FALSE, eval = FALSE--------------------
#  method(fit) <- "rpart"
#  update(fit)

## ----echo = FALSE, message=FALSE, error=TRUE----------------------------------
method(fit) <- "rpart"
tmp <- try(update(fit))

## -----------------------------------------------------------------------------
control(fit) <- trainControl(method = "repeatedcv", number = 5, repeats = 2)

# 'normalize' is not valid for continuous classifiers. We use 'preProcessing'
# rather than 'normalize'.
preProcessing(fit) <- "tmm-logcpm"

fit <- update(fit)
show(fit)

## ----session_info-------------------------------------------------------------
sessionInfo()