##############
#This file should be run in the second.
#This file takes as input the 'genes.go.mat' RData file generated by the 'molecule_go_map.R' file.
#In a separate directory, store the 'molecule_classification.R' file, create a folder named 'object' in that directory, and store the 'genes.go.mat' RData file there.
#This R file contains the code for the random forest classification of the pathway molecules GO matrix.
#First we will partition the matrix data for train and test types.
#Next we will build the classification model with the train data and then test the test data.
#!!!Please note that building the classification model with large number of features (here GO IDs) will take several hours.
#After that we will tune the classification model to set the optimum parameters for the model.
#After tuning we will again build the model with optimum parameters and test the test data.
#Next we will collect the top 50 features (GO terms) from the model, and then build the model again with only these 50 features.
#!!!Here, building the classification model with only 50 features will not take much time.
#Then we will test the test data several times with no seed values.
#Each time hopefully we will get better accuracy results.
#Finally we will save the top 50 features in the result directory of the current directory to use in the next file "molecule_classification_model.R"
##############
######################################################################################
###################### For Random Forest Classification ##############################
##Read Data
#Load 'genes.go.mat' (pathway molecules x GO terms, plus a 'type' label column)
#produced by 'molecule_go_map.R'.
load("object/genes.go.mat.RData")
#it is very much required by random forest (data frame) - less error prone
genes.go.df<-as.data.frame(genes.go.mat)
#making the col names legal - here ':' changed to '.' from the colnames part
#(GO IDs such as "GO:0008150" become syntactically valid names like "GO.0008150")
names(genes.go.df) <- make.names(names(genes.go.df))
#randomForest requires a factor response for classification
genes.go.df$type <- as.factor(genes.go.df$type)
#class distribution of the five pathway-molecule types
table(genes.go.df$type)
# 1 2 3 4 5
#672 644 440 997 2456
##Data Partition, training-70% and test-30%
#fixed seed so the 70/30 split is reproducible; each row is independently
#assigned to group 1 (train) or 2 (test) with the given probabilities,
#so the realized split is only approximately 70/30
set.seed(222)
ind <- sample(2, nrow(genes.go.df), replace = TRUE, prob = c(0.7, 0.3))
train.genes.go.df <- genes.go.df[ind==1,]
dim(train.genes.go.df)
#[1] 3648 12677
test.genes.go.df <- genes.go.df[ind==2,]
dim(test.genes.go.df)
#[1] 1561 12677
##To look at the number of individual pathway molecules
table(train.genes.go.df$type)
# 1 2 3 4 5
#484 441 304 682 1737
table(test.genes.go.df$type)
# 1 2 3 4 5
#188 203 136 315 719
##Random Forest
#install.packages("randomForest")
library(randomForest)
set.seed(123)
#####First time to build the model
#Fit an initial forest with default parameters (ntree = 500, default mtry).
#!!!With ~12,676 GO-term features this step can take several hours.
initial.rf <- randomForest(type~., data=train.genes.go.df)
#To check the confusion matrix and OOB estimate of error rate
initial.rf$confusion
print(initial.rf)
#when number of trees is 500 OOB estimate of error rate: 11.49%
#we will tune the model by resetting the parameter values and hope able to get some improved results
##Prediction & Confusion Matrix - train data
#install.packages("caret")
library(caret) ##this is for confusionMatrix()
initial.train.prediction <- predict(initial.rf, train.genes.go.df)
confusionMatrix(initial.train.prediction, train.genes.go.df$type)
#Prediction & Confusion Matrix - test data
initial.test.prediction <- predict(initial.rf, test.genes.go.df)
confusionMatrix(initial.test.prediction, test.genes.go.df$type)
##the overall accuracy for train data is 0.9564 and for test data is 0.884
#Error rate of Random Forest, from here we will get the 'ntreeTry' parameter value
plot(initial.rf)
#Tune mtry
#Select the response and the predictors by column NAME rather than by position
#(the original used [,1] / [,-1], which silently breaks if 'type' is not the
#first column of the data frame).
tune.initial.rf <- tuneRF(train.genes.go.df[, setdiff(names(train.genes.go.df), "type")],
                          train.genes.go.df$type,
                          stepFactor = 0.5,
                          plot = TRUE,
                          ntreeTry = 300,
                          trace = TRUE,
                          improve = 0.05)
#Here we can see that when ntreeTry=300 we can get lowest OOB error=11.27% for mtry=224
#####End of first time running the model
#####After tuning build and test the model again with the revised parameter values
#####Here, we can see the improvement of the model
#ntree/mtry come from the tuneRF run above; importance = TRUE is required so
#importance() below can rank the features, proximity = TRUE stores the
#proximity matrix for optional downstream inspection.
after.tune.rf <- randomForest(type~., data=train.genes.go.df,
                              ntree = 300,
                              mtry = 224,
                              importance = TRUE,
                              proximity = TRUE)
#Now check the confusion matrix and OOB estimate of error rate again
after.tune.rf$confusion
print(after.tune.rf)
#when number of trees is 300 OOB estimate of error rate: 11.21%
#
##Prediction & Confusion Matrix - train data
#library(caret) ##this is for confusionMatrix()
after.tune.train.prediction <- predict(after.tune.rf, train.genes.go.df)
confusionMatrix(after.tune.train.prediction, train.genes.go.df$type)
#Prediction & Confusion Matrix - test data
after.tune.test.prediction <- predict(after.tune.rf, test.genes.go.df)
confusionMatrix(after.tune.test.prediction, test.genes.go.df$type)
##we see that after tuning the overall accuracy of the model is higher than the previous model
##previously the overall accuracy for train data was 0.9564 and for test data was 0.884
##after tuning the overall accuracy for train data is 0.9707 and for test data is 0.8847
##
##Now get top 50 variables / features based on MeanDecreaseGini and
##then build the model again with this top features and then test
#get the top 50 variables / features
#type = 2 returns the MeanDecreaseGini column; subsetting the one-column
#matrix by row drops it to a named numeric vector, so names() gives GO IDs
rf.importance<-importance(after.tune.rf, type = 2)
rf.importance.rank<-rf.importance[order(rf.importance[,1], decreasing = TRUE),]
rf.importance.rank.top50<-names(rf.importance.rank[1:50])
#
## save the top 50 features - the next file "molecule_classification_model.R"
## loads this, so the save must actually run; create result/ if it is missing
dir.create("result", showWarnings = FALSE)
top.50.feature<-rf.importance.rank.top50
save(top.50.feature, file = "result/top.50.feature.RData")
##
##get the data frame with only these top 50 features (plus the 'type' label)
genes.go.df.selected<-genes.go.df[,c("type", rf.importance.rank.top50)]
dim(genes.go.df.selected)
#[1] 5209 51
##
###This section for testing of number of times when no seed value is used###
##Data Partition
#seed intentionally NOT set here: each re-run of this section produces a new
#random 70/30 split, so repeated runs estimate the stability of the accuracy
#set.seed(222)
ind.selected <- sample(2, nrow(genes.go.df.selected), replace = TRUE, prob = c(0.7, 0.3))
train.selected.df <- genes.go.df.selected[ind.selected==1,]
dim(train.selected.df)
#[1] 3646 51
test.selected.df <- genes.go.df.selected[ind.selected==2,]
dim(test.selected.df)
#[1] 1599 51
##
##to see the number of individual molecule in the train and test data
#table(train.selected.df$type)
# 1 2 3 4 5
#467 443 298 710 1728
#table(test.selected.df$type)
# 1 2 3 4 5
#213 193 142 287 764
##
##Now build the model and predict the test data
#only 50 features now, so default parameters are fine and fitting is fast
rf.selected <- randomForest(type~., data=train.selected.df)
rf.selected$confusion
#Prediction & Confusion Matrix - selected train data
train.selected.prediction <- predict(rf.selected, train.selected.df)
confusionMatrix(train.selected.prediction, train.selected.df$type)
#Prediction & Confusion Matrix - selected test data
test.selected.prediction <- predict(rf.selected, test.selected.df)
confusionMatrix(test.selected.prediction, test.selected.df$type)
##
###This section for testing of number of times when no seed value is used###
###!!!For top 50 variable we have got accuracy around 0.90 (0.8689 - 0.9106) all times!!!
###variable importance
##testing for importance of the variables
importance(rf.selected)
varUsed(rf.selected) ##how many times each variable is used
#to plot the importance of the features by sorting
#varImpPlot(rf.selected, sort = TRUE, main = "top10 features sorted")
###
###partial plot for the importance of each variable for each class
#For each of the 5 classes, draw a partial-dependence plot of every top-50
#feature on that class; all pages go to one PDF with 5x2 panels per page.
class.name<-c("1","2","3","4","5")
var.name<-rownames(importance(rf.selected))
#pdf("partial_plot_for_features.pdf", paper = "a4")
#pdf("partial_plot_for_features.pdf")
pdf("partial_plot_for_features.pdf", paper = "letter")
#seq_along() instead of 1:length(): safe even for zero-length vectors
for (m in seq_along(class.name)) {
  #par(mfrow=c(5,2), xpd=NA)
  par(mfrow = c(5, 2))
  for (i in seq_along(var.name)) {
    main.text <- paste("Partial Dependence on", var.name[i], sep = " ")
    main.text.2 <- paste(main.text, class.name[m], sep = " for class ")
    partialPlot(rf.selected, train.selected.df, var.name[i], class.name[m],
                main = main.text.2, xlab = var.name[i])
  }
}
dev.off()
###
#####
##############################################################################################
#Add the following code to your website.
#For more information on customizing the embed code, read Embedding Snippets.
#(NOTE: the two lines above are website-embed boilerplate captured when this
#script was copied from a web page; they are commented out so the file parses.)