additional_file/molecule_classification.R

##############
#This file should be run in the second. 
#This file takes input the 'gene.go.mat' RData file generated by the 'molecule_go_map.R' file.
#In a separate drectory, store the 'molecule_classification.R' file and also create a folder name as 'object' in the current directory and store the 'gene.go.mat' RData file here.
#This R file contains the code for the random forest classification of the pathway molecules GO matrix.
#First we will partition the matrix data for train and test types.
#Next we will build the classification model with the train data and then test the test data. 
#!!!Please note that building the classification model with large number of features (here GO IDs) will take several hours.
#After that we will tune the classification model to set the optimum parameters for the model. 
#After tuning we will again build the model with optimum parameters and test the test data.
#Next we will collect the top 50 features (GO terms) from the model, and then build the model again with only these 50 features.
#!!!Here, building the classification model with only 50 features will not take much time.
#Then we will test the test data several times with no seed values.
#Each time hopefully we will get better accracy results.
#Finally we will save the top 50 features in the result directory of the current directory to use in the next file "molecule_classification_model.R"
##############




######################################################################################
###################### For Random Forest Classification ##############################
##Read Data
load("object/genes.go.mat.RData")
#it is very much required by random forest (data frame) - less error prone
genes.go.df<-as.data.frame(genes.go.mat)
#making the col names legal - here ':' changed to '.' from the colnames part
names(genes.go.df) <- make.names(names(genes.go.df))
genes.go.df$type <- as.factor(genes.go.df$type)
table(genes.go.df$type)
#  1    2    3    4    5 
#672  644  440  997 2456  


##Data Partition, training-70% and test-30%
set.seed(222)
ind <- sample(2, nrow(genes.go.df), replace = TRUE, prob = c(0.7, 0.3))
train.genes.go.df <- genes.go.df[ind==1,]
dim(train.genes.go.df)
#[1]  3648 12677
test.genes.go.df <- genes.go.df[ind==2,]
dim(test.genes.go.df)
#[1]  1561 12677


##To look at the number of individual pathway molecules 
table(train.genes.go.df$type)
#  1    2    3    4    5 
#484  441  304  682 1737 
table(test.genes.go.df$type)
#  1   2   3   4   5 
#188 203 136 315 719


##Random Forest
#install.packages("randomForest")
library(randomForest)
set.seed(123)


#####First time to build the model
initial.rf <- randomForest(type~., data=train.genes.go.df)

#To check the confusion matrix and OOB estimate of error rate
initial.rf$confusion
print(initial.rf)
#when number of trees is 500 OOB estimate of error rate: 11.49%
#we will tune the model by resetting the parameter values and hope able to get some improved results 


##Prediction & Confusion Matrix - train data
#install.packages("caret")
library(caret)   ##this is for confusionMatrix()
initial.train.prediction <- predict(initial.rf, train.genes.go.df)
confusionMatrix(initial.train.prediction, train.genes.go.df$type)

#Prediction & Confusion Matrix - test data
initial.test.prediction <- predict(initial.rf, test.genes.go.df)
confusionMatrix(initial.test.prediction, test.genes.go.df$type)
##the overall accuracy for train data is 0.9564 and for test data is 0.884


#Error rate of Random Forest, from here we will get the 'ntreeTry' parameter value
plot(initial.rf)

#Tune mtry
tune.initial.rf <- tuneRF(train.genes.go.df[,-1], train.genes.go.df[,1],
                          stepFactor = 0.5,
                          plot = TRUE,
                          ntreeTry = 300,
                          trace = TRUE,
                          improve = 0.05)

#Here we can see that when ntreeTry=300 we can get lowest OOB error=11.27% for mtry=224
#####End of first time running the model





#####After tuning build and test the model again with the revised parameter values
#####Here, we can see the improvement of the model
after.tune.rf <- randomForest(type~., data=train.genes.go.df,
                              ntree = 300,
                              mtry = 224,
                              importance = TRUE,
                              proximity = TRUE)

#Now check the confusion matrix and OOB estimate of error rate again 
after.tune.rf$confusion
print(after.tune.rf)
#when number of trees is 300 OOB estimate of error rate: 11.21%
#

##Prediction & Confusion Matrix - train data
#library(caret)   ##this is for confusionMatrix()
after.tune.train.prediction <- predict(after.tune.rf, train.genes.go.df)
confusionMatrix(after.tune.train.prediction, train.genes.go.df$type)

#Prediction & Confusion Matrix - test data
after.tune.test.prediction <- predict(after.tune.rf, test.genes.go.df)
confusionMatrix(after.tune.test.prediction, test.genes.go.df$type)
##we see that after tuning the overall accuracy of the model is higher than the previous model
##previously the overall accuracy for train data was 0.9564  and for test data was 0.884
##after tuning the overall accuracy for train data is 0.9707 and for test data is 0.8847
##


##Now get top 50 variables / features based on MeanDecreaseGini and 
##then build the model again with this top features and then test
#get the top 50 variables / features
rf.importance<-importance(after.tune.rf, type = 2)
rf.importance.rank<-rf.importance[order(rf.importance[,1], decreasing = T),]
rf.importance.rank.top50<-names(rf.importance.rank[1:50])
#


## save the top 50 features
#top.50.feature<-rf.importance.rank.top50
#save(top.50.feature, file = "result/top.50.feature.RData")
##


##get the data frame with only these top 50 features
genes.go.df.selected<-genes.go.df[,c("type", rf.importance.rank.top50)]
dim(genes.go.df.selected)
#[1] 5209   51
##


###This section for testing of number of times when no seed value is used###
##Data Partition
#set.seed(222)
ind.selected <- sample(2, nrow(genes.go.df.selected), replace = TRUE, prob = c(0.7, 0.3))
train.selected.df <- genes.go.df.selected[ind.selected==1,]
dim(train.selected.df)
#[1]  3646 51
test.selected.df <- genes.go.df.selected[ind.selected==2,]
dim(test.selected.df)
#[1]  1599 51
##

##to see the number of individual molecule in the train and test data
#table(train.selected.df$type)
#  1    2    3    4    5 
#467  443  298  710 1728
#table(test.selected.df$type)
#  1   2   3   4   5 
#213 193 142 287 764 
##

##Now build the model and predict the test data
rf.selected <- randomForest(type~., data=train.selected.df)
rf.selected$confusion

#Prediction & Confusion Matrix - selected train data
train.selected.prediction <- predict(rf.selected, train.selected.df)
confusionMatrix(train.selected.prediction, train.selected.df$type)

#Prediction & Confusion Matrix - selected test data
test.selected.prediction <- predict(rf.selected, test.selected.df)
confusionMatrix(test.selected.prediction, test.selected.df$type)
##
###This section for testing of number of times when no seed value is used###
###!!!For top 50 variable we have got accuracy around 0.90 (0.8689 - 0.9106) all times!!!



###variable importance
##testing for importance of the variables
importance(rf.selected)
varUsed(rf.selected)    ##how many times each variable is used

#to plot the importance of the features by sorting
#varImpPlot(rf.selected, sort = TRUE, main = "top10 features sorted")
###



###partial plot for the importance of each variable for each class
class.name<-c("1","2","3","4","5")
var.name<-rownames(importance(rf.selected))
#pdf("partial_plot_for_features.pdf", paper = "a4")
#pdf("partial_plot_for_features.pdf")
pdf("partial_plot_for_features.pdf", paper = "letter")
for(m in 1: length(class.name)){
  #par(mfrow=c(5,2), xpd=NA)
  par(mfrow=c(5,2))
  for (i in 1:length(var.name)){
    main.text<-paste("Partial Dependence on", var.name[i], sep = " ")
    main.text.2<-paste(main.text, class.name[m], sep = " for class ")
    partialPlot(rf.selected, train.selected.df, var.name[i], class.name[m],
                main=main.text.2, xlab = var.name[i])
  }
}
dev.off()
### 
#####
##############################################################################################
humayun2017/SPAGI2 documentation built on Aug. 5, 2020, 12:06 a.m.