BioSeqClass.R
In BioSeqClass: Classification for Biological Sequences

### R code from vignette source 'BioSeqClass.Rnw'

###################################################
### code chunk number 1: install (eval = FALSE)
###################################################
## if (!requireNamespace("BiocManager", quietly=TRUE))
##     install.packages("BiocManager")
## BiocManager::install("BioSeqClass")


###################################################
### code chunk number 2: install2 (eval = FALSE)
###################################################
## source("http://www.biosino.org/download/BioSeqClass/BioSeqClass.R")
## BioSeqClass()


###################################################
### code chunk number 3: load
###################################################
library(BioSeqClass)


###################################################
### code chunk number 4: homologReduction
###################################################
  library(BioSeqClass)
  # Example data in BioSeqClass.
  file=file.path(path.package("BioSeqClass"),"example","acetylation_K.fasta")
  posfile = file.path(path.package("BioSeqClass"), 
       "example", "acetylation_K.site")
  # Only a part of lysine acetylation sites are used for demo.
  posfile1=tempfile()
  write.table(read.table(posfile,sep='\t',header=F)[1:20,], posfile1, 
    sep='\t', quote=F, row.names=F, col.names=F)
  seqList = getTrain(file, posfile1, aa="K", w=7, identity=0.4)


###################################################
### code chunk number 5: readData
###################################################
  tmpDir=file.path(path.package('BioSeqClass'), 'example')
  tmpFile1=file.path(tmpDir, 'acetylation_K.pos40.pep')
  tmpFile2=file.path(tmpDir, 'acetylation_K.neg40.pep')
  posSeq=as.matrix(read.csv(tmpFile1,header=F,sep='\t',row.names=1))[,1]
  negSeq=as.matrix(read.csv(tmpFile2,header=F,sep='\t',row.names=1))[,1]
  seq=c(posSeq,negSeq)  
  classLable=c(rep("+1",length(posSeq)),rep("-1",length(negSeq)) ) 
  length(seq)


###################################################
### code chunk number 6: featureSelect
###################################################
  # Use 0-1 binary coding.
  feature1 = featureBinary(seq,elements("aminoacid"))
  dim(feature1)
  # Use the compostion of paired amino acids.
  feature2 = featureGapPairComposition(seq,0,elements("aminoacid"))
  dim(feature2)


###################################################
### code chunk number 7: classModel
###################################################
  data = data.frame(feature1,classLable)
  # Use support vector machine and 5 fold cross validation to do classification.
  LIBSVM_CV5 = classify(data, classifyMethod='libsvm',
               cv=5, svm.kernel='linear',svm.scale=F)
  LIBSVM_CV5[["totalPerformance"]]
  # Features selection is done by envoking "CfsSubsetEval" method in WEKA.
  FS_LIBSVM_CV5 = classify(data, classifyMethod='libsvm',
                  cv=5, evaluator='CfsSubsetEval', search='BestFirst',
                  svm.kernel='linear', svm.scale=F)
  
  FS_LIBSVM_CV5[["totalPerformance"]] ## Accuracy is increased by feature selection.
  # Selected features:
  colnames(data)[FS_LIBSVM_CV5$features[[1]]]


###################################################
### code chunk number 8: featureTest (eval = FALSE)
###################################################
##   fileName = tempfile()
##   # Note: It may be time consuming.
##   testFeatureSet = featureEvaluate(seq, classLable, fileName, cv=5, 
##     ele.type='aminoacid', featureMethod=c('Binary','GapPairComposition'),
##     classifyMethod='libsvm',
##     group=c('aaH', 'aaV', 'aaZ', 'aaP', 'aaF', 'aaS', 'aaE'), g=0,
##     hydro.methods=c('kpm', 'SARAH1'), hydro.indexs=c('hydroE', 'hydroF', 'hydroC') )
##   summary = read.csv(fileName,sep="\t",header=T)      
##   # Plot the result of 'featureEvaluate'
##   require("scatterplot3d")     
##   tmp1 = summary[,"Feature_Function"]
##   tmp2 = as.factor(sapply(as.vector(summary[,'Feature_Parameter']),
##          function(x){unlist(strsplit(x,split='; '))[1]}))
##   testResult = data.frame(as.integer(tmp2), as.integer(tmp1), summary[,"acc"])
##   s3d=scatterplot3d(testResult,
##       color=c('red','blue')[testResult[,2]], pch=19, xlab='Parameter',
##       ylab='Feature Coding',
##       zlab='Accuracy', lab=c(9,3,7),       
##       x.ticklabs=gsub('class: ','',sort(unique(tmp2))), 
##       type='h',ylim=c(0,3),y.margin.add=2.5,
##       y.ticklabs=c('',gsub('feature','',sort(unique(tmp1))),'') )


###################################################
### code chunk number 9: featureComSelect (eval = FALSE)
###################################################
##   feature.index = 1:3
##   tmp <- testFeatureSet[[1]]$data
##   tmp1 <- testFeatureSet[[feature.index[1]]]$model
##   colnames(tmp) <- paste(
##     tmp1["Feature_Function"],
##     tmp1["Feature_Parameter"],
##     colnames(tmp),sep=" ; ")      
##   data <- tmp[,-ncol(tmp)]  
##   for(i in 2:length(feature.index) ){
##     tmp <- testFeatureSet[[feature.index[i]]]$data
##     tmp1 <- testFeatureSet[[feature.index[i]]]$model
##     colnames(tmp) <- paste(
##       tmp1["Feature_Function"],
##       tmp1["Feature_Parameter"],
##       colnames(tmp),sep=" ; ")    
##     data <- data.frame(data, tmp[,-ncol(tmp)] )
##   }
##   name <- colnames(data)
##   data <- data.frame(data, tmp[,ncol(tmp)] )  ## Combined features
##   # Use 'selectFFS' to do feature forward selection.
##   # Note: It may be time consuming.
##   combineFeatureResult = selectFFS(data,accCutoff=0.005,
##     classifyMethod="knn",cv=5)  ## It is time consuming.
##   tmp = sapply(combineFeatureResult,function(x){
##     c(length(x$features),x$performance["acc"])})
##   plot(tmp[1,],tmp[2,],xlab="Feature Number",ylab="Accuracy",
##     , pch=19)
##   lines(tmp[1,],tmp[2,])


###################################################
### code chunk number 10: BioSeqClass.Rnw:488-489
###################################################
sessionInfo()