Nothing
#' Bp Barcoding Species Identify using Kmer
#'
#' @description Species identification using BP-based method for both protein-coding
#' barcodes, for instance, COI, and non-coding barcodes, such as, ITS, using kmer statistics.
#'
#' @param ref object of class "DNAbin" used as a reference dataset, which contains taxon information.
#' @param que object of class "DNAbin", which needs to be inferred.
#' @param kmer a numeric indicating the length of kmer used.
#' @param UseBuiltModel logic value to indicate whether a built model is used
#' or not.
#' @param lr parameter for weight decay. Default 5e-5.
#' @param maxit maximum number of iterations. Default 1e+6.
#' @return a list containing model parameters used, species identification success rates using references,
#' query sequences, species inferred, and corresponding confidence levels (bp probability for BP-based method).
#' @keywords bbsik
#' @export
#' @import nnet
#' @author Ai-bing ZHANG, Meng-di HAO, Cai-qing YANG, CNU, Beijing, CHINA. zhangab2008 (at) mail. cnu. edu.cn
#' @references
#' Zhang, A. B., D. S. Sikes, C. Muster, S. Q. Li. (2008). Inferring Species Membership
#' using DNA sequences with Back-propagation Neural Networks. Systematic Biology, 57(2):202-215.
#' https://academic.oup.com/sysbio/article/57/2/202/1622290
#'
#'
#' @examples
#' data(TibetanMoth)
#' ref<-as.DNAbin(as.character(TibetanMoth[1:50,]))
#' que<-as.DNAbin(as.character(TibetanMoth[51:60,]))
#' out<-bbsik(ref, que, kmer = 1, UseBuiltModel = FALSE)
#' out
#' out$convergence
#' out$success.rates.ref
#'
#' data(pineMothITS2)
#' ref<-pineMothITS2
#' que<-pineMothITS2
#' out<-bbsik(ref, que, kmer = 1, UseBuiltModel = FALSE)
#' out
#' out$convergence
#' out$success.rates.ref
#'
#'
#'
bbsik<-function (ref, que, kmer = kmer, UseBuiltModel = FALSE,lr=5e-5, maxit=1e+6) {
### functions used!
DNAbin2kmerFreqMatrix2<-function(ref,que,kmer=kmer){
### return kmer frequency matrices for both ref and que sequences, but only based on kmers found in ref!!!
### new kmers in que will be ignored
#require(seqinr)
#kmer<-1
### 1. check the format of input arguments
if (class(ref)!="DNAbin")
stop("seqs should be in DNAbin format!")
if (class(kmer)!="integer") kmer<-as.integer(kmer)
### 2. seek unique.kmer.vector for all seqs: u.s
seqs.as.char<-as.character(ref)
seqs.as.char2<-as.character(que)
ifelse(is.vector(seqs.as.char),seqs.as.str.vector<-lapply(seqs.as.char,FUN=c2s),seqs.as.str.vector<-apply(seqs.as.char, MARGIN=1,FUN=c2s))
ifelse(is.vector(seqs.as.char2),seqs.as.str.vector2<-lapply(seqs.as.char2,FUN=c2s),seqs.as.str.vector2<-apply(seqs.as.char2, MARGIN=1,FUN=c2s))
seqs.unique.as.str.vector<-unique(seqs.as.str.vector)
s<- seqs.unique.as.str.vector
u.s0<-unique(substring(s, 1, kmer)) ### check along the column first!
u.s<-u.s0
#length(u.s)
for (i in 2:(max(nchar(s))-kmer+1)){
# i<-2
#u.s<-u.s0 ### unique str
u.s<-c(u.s,unique(substring(s, i, kmer+i-1)))
u.s<-unique(u.s)
n.char<-nchar(u.s)
size.kmer.exact<-n.char==kmer ### logic to remove short kmer!
u.s<-subset(u.s,size.kmer.exact)
} ### the end of for-loop
### do some cleaning by removing IUPAC codes, "-"
mpattern<-"-+[a-z]*"
u.s<-gsub(mpattern,NA,u.s) # strings with "-"
mpattern<-"[rymkswhbvdn]+"
u.s<-gsub(mpattern,NA,u.s) # strings with "-"
u.s<- u.s[!is.na(u.s)]
### 3. calculate kmer frequency for each sequence in ref
#b<-gregexpr(u.s[1],seqs.as.str.vector[1])
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
#kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]])
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- kmer.freq.one.seq2
for (k in 2:length(seqs.as.str.vector)){
# k<-2
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector[k])
#b<-lapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- c(kmer.freq.one.seq3,kmer.freq.one.seq2)
} ### end of k-for-loop
kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq),length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq))))
#kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq),length(kmer.freq.one.seq)))) ### error!?
rownames(kmer.freq.matrix)<-rownames(ref)
### 4. calculate kmer frequency for each sequence in que
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector2[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
#kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]])
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- kmer.freq.one.seq2
for (k in 2:length(seqs.as.str.vector2)){
# k<-2
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector2[k])
#b<-lapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- c(kmer.freq.one.seq3,kmer.freq.one.seq2)
} ### end of k-for-loop
kmer.freq.matrix2<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq),length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq))))
#kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq),length(kmer.freq.one.seq)))) ### error!?
rownames(kmer.freq.matrix2)<-rownames(que)
out<-list(unique.str=u.s,kmer.Freq.ref=kmer.freq.matrix, kmer.Freq.que=kmer.freq.matrix2)
#cat(kmer.freq.matrix)
#return(kmer.freq.matrix)
return(out)
}
char2NumVector<-function(c){
if (class(c)!="character") c<-as.character(c)
#stop("invalid input format! input should be character vector!")
c<-as.factor(c)
level.c<-levels(c)
#b<-dim(ecol.sample.list)[1]
levels(c)<-seq(1:length(level.c))
c<-as.numeric(c)
return(c)}
strings.equal<-function(str1,str2){ifelse(str1==str2,1,0)}
##########
c2s<-function (chars = c("m", "e", "r", "g", "e", "d")) {
return(paste(chars, collapse = ""))
}
##########
NAMES<-function(seqs){
if(mode(seqs)=="raw"){
SeqNames<-attr(seqs,"dimnames")[[1]]
}else{ ### mode(seqs)=="list"
SeqNames<-names(seqs)
}
#names(SeqNames)<-NULL
if(length(SeqNames)==0) {
stop("the mode(seqs) is wrong!")
}else{
return(SeqNames) }
}
if(UseBuiltModel == FALSE){
sampleSpeNames<-attr(ref,"dimnames")[[1]]
if(length(sampleSpeNames)==0) sampleSpeNames<-names(ref)
queIDs<-attr(que,"dimnames")[[1]]
if(length(queIDs)==0) queIDs<-names(que)
mpattern<-".+,"
Spp<-gsub(mpattern,"",sampleSpeNames) # remove seqs names before "," (incl.",")
#Spp
#length(unique(Spp))
sampleSpeNames2<-class.ind(Spp)
#if (kmer>0.05*len.shortest.seq)
# stop("kmer is too large, it will take lots of time to run! a vaule less than 10 is suggested!")
out1<-DNAbin2kmerFreqMatrix2(ref,que,kmer=kmer)
unique.str.ref<-out1$unique.str
# bp<-function(ref1,sampleSpeNames2,que1,unique.str.ref,lr=5e-5, maxit=1e+6){
bp<-function(ref1,sampleSpeNames2,que1,lr=5e-5, maxit=1e+6){
#unique.str.ref<-out1$unique.str
### standard
center.ref1<-apply(ref1,MARGIN=2,FUN=mean)
sd.ref1<-apply(ref1,MARGIN=2,FUN=sd)
ref1<-scale(ref1,scale=F)
que1<-scale(que1,center = center.ref1, scale=F)
#que1<-scale(que1,center = center.ref1, scale=sd.ref1)
ref1_tmp<-ref1[!is.na(ref1)]
range1<-1./max(abs(ref1_tmp))
# range1
n.hidden<-ceiling(log2(dim(ref1)[1]))
nnet.trained <- nnet(ref1, sampleSpeNames2, size = n.hidden, rang = range1,### 0.5
entropy = FALSE,
decay = lr, maxit = maxit,#decay = 5e-5, maxit = 1e+6,
abstol = 1.0e-8,
reltol = 1.0e-8,
MaxNWts = 20000) ### decay = 5e-5
spe.inferred0<-predict(nnet.trained, que1)
spe.inferred<-spe.inferred0
#spe.inferred[spe.inferred>=0.95]<-1
#spe.inferred[spe.inferred<0.95]<-0
#colnames(spe.inferred)
#which.max(spe.inferred[1,])
inferred<-apply(spe.inferred,1,FUN=which.max)
inferred.prob<-apply(spe.inferred,1,FUN=max)
# length(inferred)
#inferred
#colnames(spe.inferred)[inferred]
############################################################
####### calculate model success rate using ref start...
###########################################################
spe.inferred0.ref<-predict(nnet.trained, ref1)
spe.inferred.ref<-spe.inferred0.ref
inferred.ref<-apply(spe.inferred.ref,1,FUN=which.max)
inferred.prob.ref<-apply(spe.inferred.ref,1,FUN=max)
# length(inferred)
#inferred
p.ref<-colnames(spe.inferred.ref)[inferred.ref]
#p.ref[1]<-"abz"
spe.morph.Identified<-data.frame(Spp,p.ref,stringsAsFactors=TRUE)
matches<-apply(spe.morph.Identified,2,strings.equal,str2=spe.morph.Identified[,2])
matches<-colSums(matches,dims = 1)
success.rates.ref<-matches[1]/matches[2]
names(success.rates.ref)<-NULL
############################################################
####### calculate model success rate using ref the end.
###########################################################
output.identified<-data.frame(queIDs=queIDs,
spe.Identified=colnames(spe.inferred)[inferred],
bp.prob=inferred.prob,stringsAsFactors=TRUE)
#output.identified
rownames(output.identified)<-NULL
out.bp<-list(summary.model=summary(nnet.trained),###convergence=nnet.trained$convergence,
convergence=nnet.trained$convergence,
success.rates.ref=success.rates.ref,
kme.ref=kmer,
output_identified=output.identified,
nnet.trained=nnet.trained,
center.ref1=center.ref1
)
#current.wd<-getwd()
#setwd
return(out.bp)
#return(output.identified)
}
output2.identified<-bp(ref1=out1$kmer.Freq.ref,
sampleSpeNames2,
que1=out1$kmer.Freq.que,
#unique.str.ref=out1$unique.str,
lr=5e-5,
maxit=1e+6)
#fileName<-"bbsik_tmp"
#fileName<-paste("simulation",i,sep = "")
#fileName<-paste(fileName,".RData",sep = "")
#Rhome<-R.home() ### 2020/4/13 21:50:29
Rhome<-tempdir() ### 2020/4/14 21:31:02
fileName<-"bbsik_tmp"
fileName<-paste(Rhome,fileName,sep = "/")### 2020/4/13 21:50:34
fileName
fileName<-paste(fileName,".RData",sep = "")
fileName
save(output2.identified,
#nnet.trained,
# out.bp,
# success.rates.ref,
# kmer,
unique.str.ref,
# center.ref1,
file = fileName) ## #save(x, y, file = "xy.RData")
}else{
##################### ### 2020/4/13 21:57:42
#Rhome<-R.home() ### 2020/4/13 21:57:47
Rhome<-tempdir() ### 2020/4/14 21:31:02
fileName<-"bbsik_tmp"
fileName<-paste(Rhome,fileName,sep = "/")### 2020/4/13 21:06:46
fileName
fileName<-paste(fileName,".RData",sep = "")
fileName
############################ 2020/4/13 21:57:50
#if(!file.exists("bbsik_tmp.RData"))
if(!file.exists(fileName)) ### 2020/4/13 21:52:14
stop("file bbsik_tmp.RData not found in R.home directory!
you need to rebuild the model!
")
### 4. calculate kmer frequency for each sequence in que
#load("bbsik_tmp.RData")
load(fileName) ####2020/4/13 21:58:54
center.ref1<-output2.identified$center.ref1
nnet.trained<-output2.identified$nnet.trained
success.rates.ref<-output2.identified$success.rates.ref
queIDs<-attr(que,"dimnames")[[1]]
if(length(queIDs)==0) queIDs<-names(que)
#u.s<-unique.str.ref
u.s<-unique.str.ref
seqs.as.char2<-as.character(que)
ifelse(is.vector(seqs.as.char2),seqs.as.str.vector2<-lapply(seqs.as.char2,FUN=c2s),seqs.as.str.vector2<-apply(seqs.as.char2, MARGIN=1,FUN=c2s))
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector2[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
#kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]])
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- kmer.freq.one.seq2
for (k in 2:length(seqs.as.str.vector2)){
# k<-2
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector2[k])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- c(kmer.freq.one.seq3,kmer.freq.one.seq2)
} ### end of k-for-loop
kmer.freq.matrix2<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq),length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq))))
rownames(kmer.freq.matrix2)<-rownames(que)
if(length(rownames(kmer.freq.matrix2))==0) rownames(kmer.freq.matrix2)<-names(que)
que1<-scale(kmer.freq.matrix2,center = center.ref1, scale=F)
spe.inferred0<-predict(nnet.trained, que1)
spe.inferred<-spe.inferred0
inferred<-apply(spe.inferred,1,FUN=which.max)
inferred.prob<-apply(spe.inferred,1,FUN=max)
output.identified<-data.frame(queIDs=queIDs,
spe.Identified=colnames(spe.inferred)[inferred],
bp.prob=inferred.prob,stringsAsFactors=TRUE)
rownames(output.identified)<-NULL
output2.identified<-list(summary.model=summary(nnet.trained),
convergence=nnet.trained$convergence,
success.rates.ref=success.rates.ref,
kme.ref=kmer,
output_identified=output.identified)
}
class(output2.identified) <- c("BarcodingR")
return(output2.identified)
} ### the end of the function!
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.