# This is a pipeline of different gene selection and class prediction methods to identify a set of genes for predicting known classes
# edata: is the gene expression data with samples on the column.
# labs: is a vector with known classes (samples should be matched to the edata)
# seqp: is a vector with sequence of the number of to genes to selected
# fac: is a proportion how the data should be split for cross validation
# seeed: is a number used to fix results for reproducibility
intpred <- function(edata,labs,seqp,fac,seed=166)
{
#### Required packages
installed<-installed.packages()[,1]
required<-c("e1071","randomForest","rpart","MASS","siggenes",
"multtest","samr","pamr","RColorBrewer",'devtools')
toinstall<-required[!(required %in% installed)]
if(length(toinstall) != 0){
source("https://bioconductor.org/biocLite.R")
biocLite(toinstall) }
lapply(required, require, character.only = TRUE)
requirfMM<-c("sma")
toinstallfMM<-requirfMM[!(requirfMM %in% installed)]
if(length(toinstallfMM) != 0){ install_github("gnyamundanda/sma") }
lapply(requirfMM, require, character.only = TRUE)
#### GeneSelection folder for storing all results
dir.create("GeneSelection", showWarnings = FALSE)
setwd("GeneSelection")
dir.create("indata", showWarnings = FALSE)
dir.create("result", showWarnings = FALSE)
dir.create("temp", showWarnings = FALSE)
dir.create("topgenes", showWarnings = FALSE)
pad='indata' # training datasets storage
padout='result' # results of gene selection and misclassification
padout2='temp' # temporary results storage
#### prepare data for the classification problem
set.seed(seed) # set seed to remove randomness due to sampling
labels<-labs; length(labels)
ord_lables<-order(labels) # order labels
labs<-labels[ord_lables]
expdata<-edata[,ord_lables] # order exp data according to ordered labels
resp<-as.numeric(labs)
nsubs=length(names(table(resp)))
samplez<-paste(c("sample_"),1:length(labs),sep="") # create new naming for samples
resp1<-as.data.frame(cbind(samplez,resp))
genames<-paste(c("gene_"),1:nrow(expdata),sep="") # create new naming for genes
expd<-expdata
rownames(expd)<-genames
colnames(expd)<-samplez
array<-as.data.frame(expd) # array is the expession data (genes on rows)
#### setting up parameters
nsamp=50 # simulation size
nrmethNames=c('pam','ps','bw')
nrmeth=length(nrmethNames) # number of feature seletion methods
nrmethclNames=c('rf','dlda','svmln','svmrd')
nrmethcl=length(nrmethclNames) # number of class prediction methods
response=resp
narray=length(response)
dataset=trainTest(response, array, fac) # generate training and testing data only to for setting up matrices
dataset1=as.data.frame(dataset$Learning_set)
dataset2=as.data.frame(dataset$Test_set)
nLarray=nrow(dataset1) # number of training samples (its fixed by fac)
nTarray=nrow(dataset2) # number of test samples (its fixed by fac)
test.idx=rep(0,narray) # Matrix to store the sample selected as test data for each of the nsamp runs
p=seqp # sequence of gene selection
np=length(p) # nr of gene sets
ngenes=nrow(array) # nr of genes
unlistfn<-function(x,g,byrow) matrix(unlist(x),ncol=g,byrow=byrow)
######################
# 1st. gene selection
######################
resgen=lapply(1:nsamp,function(k) gsetfn(response,array,fac,narray,np,p,nrmeth,ngenes,nsubs,test.idx))
mat.train=unlistfn(lapply(resgen[1:nsamp],function(x){x['mat.tr'][[1]]}),nLarray,byrow=TRUE)
mat.test=unlistfn(lapply(resgen[1:nsamp],function(x){x['mat.ts'][[1]]}),nTarray,byrow=TRUE)
test.indexmatrix=unlistfn(lapply(resgen[1:nsamp],function(x){x['ts.idx'][[1]]}),narray,byrow=TRUE)
nameIndex=unlistfn(lapply(resgen[1:nsamp],function(x){x['nameIdx'][[1]]}),nTarray,byrow=TRUE)
Results=lapply(resgen[1:nsamp],function(x){x['res'][[1]]})
############## Storage of gene selection results
# Storage matrix indicating which sample was selected as test data at each resampling: 1 identifies the test sample
testIndex=paste(padout2,'/','testIndex','.','txt',sep='')
write.table(test.indexmatrix, file=testIndex, append=FALSE, quote=F, sep=" ", eol="\n", na="NA", dec = ".", row.names = F,
col.names=c(1:narray), qmethod= c("escape"))
# Storage matrix with indexes samples selected in test data for each random sampling.
nameIndex2 <- paste(padout2,'/','nameIndex','.','txt',sep='')
write.table(nameIndex, file=nameIndex2, append=FALSE, quote=F, sep=" ", eol="\n", na="NA", dec=".", row.names=F,
col.names = c(1:nTarray), qmethod = c("escape"))
# Matrix for storing the names of samples in the training set for each random sampling
trainMatrix=paste(pad,'/','trainMatrix','.','txt',sep='')
write.table( mat.train, file = trainMatrix, append = FALSE, quote = F,sep = " ",eol = "\n", na = "NA", dec = ".", row.names =F ,
col.names = c(1:nLarray), qmethod = c("escape"))
# Matrix for storing the names of samples in the test set for each random sampling
testMatrix=paste(pad,'/','testMatrix','.','txt',sep='')
write.table( mat.test, file = testMatrix, append = FALSE, quote = F,sep = " ",eol = "\n", na = "NA", dec = ".", row.names =F ,
col.names = c(1:nTarray), qmethod = c("escape"))
# Gene selection output: storing results of each gene selection output
for(k in 1:nsamp){
for(i in 1:np){
gfile=paste(padout,'/','simulation_',k,'_top_',p[i],'.','txt',sep='')
write.table(Results[[k]][[i]], file = gfile, append = FALSE, quote = F, sep = " ",
eol = "\n", na = "NA", dec = ".", row.names = F, col.names = TRUE, qmethod = c("escape"))}}
filetemp=paste(padout,'/','feature_selection_worksp.RData',sep='')
save.image(filetemp)
#####
#######################
# 2nd. class prediction
#######################
# storing number of errors and total number of samples considered per each simulation, each feature sel method in combn with class pred method
method=list()
for(l in 1:nrmeth){ # running over number of feature selection methods
method[[l]]=list()
for(cp in 1:nrmethcl){ method[[l]][[cp]]=matrix(0,nsamp,np*2) } }
for(k in 1:nsamp){ # run over number of simulations
LS.resp=resp1[sapply(mat.train[k,], function(x){which(x==resp1[,1])} ),2] # matching the samples
TS.resp=resp1[sapply(mat.test[k,], function(x){which(x==resp1[,1])} ),2]
TrLS=t(array[,sapply(mat.train[k,], function(x){which(x==resp1[,1])})]) # matching the samples to obtain exp data
TrTS=t(array[,sapply(mat.test[k,], function(x){which(x==resp1[,1])})])
LS=t(TrLS)
TS=t(TrTS)
TrLS1=data.frame(TrLS,LS.resp)
TrTS1=data.frame(TrTS,TS.resp)
for(i in 1:np){ # run over the number top p[i] features
gfile=paste(padout,'/simulation_',k,'_top_',p[i],'.txt',sep='') # find selected gene set for the top nr of genez (i) and sampling (k)
Res2=read.table(file = gfile, header = TRUE, quote = "", sep = " ", na = "NA", dec = ".")
for(l in 1:nrmeth){ # run over each feature selection method
## find the gene set selected by each feature selection method
LStop=Res2[,l] # select gene set for each gene selection method(l), top nr of genez (i) and sampling (k)
TrLS2=data.frame(TrLS[,LStop],LS.resp) # select training set associated with the gene set
TrTS2=data.frame(TrTS[,LStop],TS.resp) # select testing set
#####
## RF
LS.rf2<-randomForest(as.factor(LS.resp)~.,data=TrLS2,importance=TRUE,proximity=TRUE)
TS.pred2<-predict(LS.rf2,TrTS2) # predict the classes using RF
e3=table(TS.pred2,TS.resp)
method[[l]][[1]][k,i]=sum(TS.pred2!=TS.resp) # number of misclassified samples
method[[l]][[1]][k,i+np]=sum(e3) # total number of samples
########
## DLDA
disc.dlda=stat.diag.da(TrLS[,LStop],TrTS[,LStop],cl=as.integer(LS.resp),pool=1)[[1]]
f3=table(disc.dlda,TS.resp)
method[[l]][[2]][k,i]=sum(disc.dlda!=TS.resp)
method[[l]][[2]][k,i+np]=sum(f3)
##############
## SVM-Linear
model1=svm(as.factor(LS.resp)~.,data=TrLS2, kernel="linear")
pred1=predict(model1, newdata=TrTS2)
g3=table(pred1,TS.resp)
method[[l]][[3]][k,i]=sum(pred1!=TS.resp)
method[[l]][[3]][k,i+np]=sum(g3)
##############
## SVM-Radial
model1=svm(as.factor(LS.resp)~.,data=TrLS2, kernel="radial")
pred1=predict(model1, newdata=TrTS2)
g4=table(pred1,TS.resp)
method[[l]][[4]][k,i]=sum(pred1!=TS.resp)
method[[l]][[4]][k,i+np]=sum(g4)
} # end l
} # end i
topcolnames<-c(paste0("Top",p),paste0("Top",p,"T"))
cpm=paste0(nrmethclNames,'file')
for(l in 1:nrmeth){
for(cp in 1:nrmethcl){
ffile=paste0(padout2,'/',cpm[cp],'-',nrmethNames[l],'.txt')
write.table(method[[l]][[cp]], file=ffile, append=FALSE,quote=F, sep=" ",eol="\n",na="NA",
dec=".",row.names=F,col.names=topcolnames,qmethod=c("escape"))}}
} # end k
#####################
# Assess performance
#####################
# Misclassification Error Rate (MCR)
MCR=lapply(1:nrmeth,function(l){lapply(1:nrmethcl,function(m){method[[l]][[m]][1:nsamp,1:np]/nTarray })})
# mean misclassification error rate for ALL data partition
MCR2=lapply(1:nrmeth,function(l){lapply(1:nrmethcl,function(m){ colMeans(MCR[[l]][[m]]) })})
# sd misclassification error rate for ALL data partition
MCR3=lapply(1:nrmeth,function(l){lapply(1:nrmethcl,function(m){ apply(MCR[[l]][[m]],2,sd) })})
## Rearraing the MCR2 and MCR3 into a data frame of class pred methods, feature sel methods and the top feeatures
MCRresult=MCRstd=matrix(0,nrmeth*nrmethcl,np+2)
for(m in 1:nrmethcl){ for(l in 1:nrmeth){
tm=(m-1)*nrmeth+l
MCRresult[tm,1]=MCRstd[tm,1]=nrmethclNames[m]
MCRresult[tm,2]=MCRstd[tm,2]=nrmethNames[l]
MCRresult[tm,3:(np+2)]=round(MCR2[[l]][[m]][1:np],2)
MCRstd[tm,3:(np+2)]=round(MCR3[[l]][[m]][1:np],2)
}
}
MCRresult=data.frame(MCRresult)
MCRstd=data.frame(MCRstd)
filetemp2=paste(padout,'/','MCRmeangenes.txt',sep='')
filetemp3=paste(padout,'/','MCRstdgenes.txt',sep='')
testIndex=paste(padout2,'/','testIndex','.','txt',sep='')
filetemp=paste(padout,'/','allworksp.RData',sep='')
write.table(MCRresult,file=filetemp2,append=FALSE,quote=F,sep=" ",eol="\n",na="NA",dec=".",row.names=F,qmethod=c("escape"))
write.table(MCRstd,file=filetemp3,append=FALSE,quote=F,sep=" ",eol="\n",na="NA",dec=".",row.names=F,qmethod=c("escape"))
write.table(test.indexmatrix,file=testIndex,append=FALSE,quote=F,sep=" ",eol="\n",na="NA",dec=".",row.names=F,col.names=c(1:narray), qmethod=c("escape"))
save.image(filetemp)
########
# Plots
#########
# transform the average and std mcr into np by different classifier
means.all<-read.delim("result/MCRmeangenes.txt", header = TRUE, quote = "", sep = "", dec = ".")
std.all<-read.delim("result/MCRstdgenes.txt", header = TRUE, quote = "", sep = "", dec = ".")
order.means<-means.all[order(means.all$X2),]
splitter<-order.means$X2 # split average mcr based on gene selection methods
parts<-split(order.means,splitter)
order.std<-std.all[order(std.all$X2),]
partsd<-split(order.std,splitter) # split std mcr based on gene selection methods
cmean<-csd<-mnames<-NULL
for(i in 1:length(parts)){
cmean<-cbind(cmean,matrix(as.numeric(t(parts[[i]])[-c(1,2),]),np,nrmethcl))
csd<-cbind(csd,matrix(as.numeric(t(partsd[[i]])[-c(1,2),]),np,nrmethcl))
mnames<-c(mnames,paste0(names(parts)[i],'_',nrmethclNames))}
ps.mean<-cbind(p,cmean)
ps.std<-cbind(p,csd)
colnames(ps.mean)<-colnames(ps.std)<-c("Top",mnames)
# mcr figure
pdf(file=paste0(Sys.Date(),"_misclassification_error_rate.pdf"),onefile=TRUE,pointsize=16,width=10, height=10)
par(mar=c(4,4,2,1))
colg<-brewer.pal(n=12,name="Paired"); colg[11]<-"grey50"
plotMCR(colg,ps.mean,ps.std,nrmethcl,nrmeth,p,np,nsamp)
dev.off()
gfile1=paste0(Sys.Date(),'_MCR.txt')
gfile2=paste0(Sys.Date(),'_MCR_std.txt')
write.table(ps.mean,file=gfile1,append=FALSE,quote=F,sep="\t",row.names=F,col.names=TRUE)
write.table(ps.std,file=gfile2,append=FALSE,quote=F,sep="\t",row.names=F,col.names=TRUE)
#################
# Selected genes
#################
# final set of genes with the least MER
# 1st. based on common genes among different gene selection methods at the with the optimal number if genes
indx=rowSums(ps.mean[,-1]==min(ps.mean[,-1]))
idx=((1:nrow(ps.mean))[indx>0])[1] # identify the smallest top nr of genes with least mcr
msng<-matrix(NA,(nrow(array)-p[idx]),nsamp*nrmeth) # matrix for genes which were NOT selected by each gene sel metho at each sampling
cnt=0
for(k in 1:nsamp){ # each sampling
gfile3=paste(padout,'/simulation_',k,'_top_',p[idx],'.txt',sep='')
for(j in 1:nrmeth){ # each gene sel method
cnt=cnt+1
ptop=read.table(file=gfile3, header=TRUE, quote="", sep=" ", na="NA", dec=".")
msng[,cnt]<-c(1:nrow(array))[-ptop[,j]] # identify the index of genes which were NOT being selected
}}
sgenes<-sort(table(unlist(msng)),decreasing=TRUE) # sort by genes which were frequently NOT selected
rmgenes=as.numeric(names(sgenes)[1:(nrow(array)-p[idx])]) # select the top (total nr of genes - selected top nr of genes) genes were frequently NOT selected
topgenes<-rownames(expdata)[-rmgenes]
gfile4=paste0('topgenes/',Sys.Date(),'_top_',p[idx],'_genes_based_on_most_commonly_selected_genes.txt')
write.table(topgenes,file=gfile4,append=FALSE,quote=F,sep="",eol="\n",na="NA",dec=".",row.names=F,col.names=TRUE,qmethod=c("escape"))
# 2nd. based on the best gene selection method
nmdx=((colnames(ps.mean)[-1])[ps.mean[idx,-1]==min(ps.mean[idx,-1])])[1] # find best gene selection method
mdx=strsplit(nmdx,'_')[[1]][1] # name of the best gene selection method
fsng<-matrix(NA,(nrow(array)-p[idx]),nsamp) # matrix of genes which were NOT selected by best gene sel metho at each sampling
for(k in 1:nsamp){ gfile5=paste(padout,'/','simulation_',k,'_top_',p[idx],'.','txt',sep='')
fptop=read.table(file=gfile5, header=TRUE, quote="", sep=" ", na="NA", dec=".")
fsng[,k]<-c(1:nrow(array))[-fptop[,nrmethNames==mdx]] # identify those genes which are NOT being selected
}
msgenes<-sort(table(unlist(fsng)),decreasing=TRUE) # sort by genes which were frequently NOT selected
mrmgenes=as.numeric(names(msgenes)[1:(nrow(array)-p[idx])]) # select the top (p - best number of genes) genes were frequently NOT selected
mtopgenes<-rownames(expdata)[-mrmgenes]
gfile6=paste0('topgenes/',Sys.Date(),'_top_',p[idx],'_genes_based_on_',mdx,'_gene_selection_method.txt')
write.table(mtopgenes,file=gfile6,append=FALSE,quote=F,sep="",eol="\n",na="NA",dec=".",row.names=F,col.names=TRUE,qmethod=c("escape"))
# 3rd. final set of genes used in the paper
idx=19 # top 38 genes
pmsng<-matrix(NA,(nrow(array)-p[idx]),nsamp*nrmeth)
cnt=0
for(k in 1:nsamp){ gfile7=paste(padout,'/','simulation_',k,'_top_',p[idx],'.','txt',sep='')
for(j in 1:nrmeth){
cnt=cnt+1
pptop=read.table(file = gfile7, header = TRUE, quote = "", sep = " ", na = "NA", dec = ".")
pmsng[,cnt]<-c(1:nrow(array))[-pptop[,j]]
}}
psgenes<-sort(table(unlist(pmsng)),decreasing = TRUE)
prmgenes=as.numeric(names(psgenes)[1:(nrow(array)-p[idx])])
ptopgenes<-rownames(expdata)[-prmgenes]
gfile8=paste0('topgenes/',Sys.Date(),'_top_38_genes_based_on_most_commonly_selected_genes_used_in_the_paper.txt')
write.table(ptopgenes,file=gfile8,append=FALSE,quote=F,sep="",eol="\n",na="NA",dec=".",row.names=F,col.names=TRUE,qmethod=c("escape"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.