Nothing
#' Species Identification Based on Fuzzy-set Method and kmer
#' @description Species identification based on fuzzy-set method (Zhang et al. 2012)and kmer.
#' @param ref object of class "DNAbin" used as a reference dataset, which contains taxon information.
#' @param que object of class "DNAbin", whose identities (species names) need to be inferred.
#' @param kmer a numeric to indicate the length of maximum kmer to try in the range of 1 to kmer in case of
#' optimization = TRUE, otherwise, only a certain length of kmer is used.
#' @param optimization a character string, indicating whether different length of kmer (up to kmer) will be used or
#' just a specified length of kmer will be used.
#' @return a list indicating the identified species.
#' @keywords barcoding.spe.identify2
#' @export
#' @import class
#' @import sp
#' @author Ai-bing ZHANG, Cai-qing YANG, Meng-di HAO, CNU, Beijing, CHINA, contact at zhangab2008 (at) mail. cnu. edu. cn.
#' @note read.dna() from package {ape} was used to obtain DNAbin object for unaligned non-coding barcodes.
#'
#' @references
#'
#' Zhang, A. B., M. D. Hao, C. Q. Yang, and Z. Y. Shi. (2017). BarcodingR: an integrated R package for species identification using DNA barcodes. Methods Ecol Evol. 8(5):627-634.
#' https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12682.
#'
#' Jin,Q., H.L. Han, X.M. Hu, X.H. Li,C.D. Zhu,S. Y. W. Ho, R. D. Ward, A.B. Zhang . (2013). Quantifying Species Diversity with a DNA Barcoding-Based Method: Tibetan Moth Species (Noctuidae) on the Qinghai-Tibetan Plateau. PloS One 8: e644.
#' https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0064428.
#'
#' Zhang, A. B., C. Muster, H.B. Liang, C.D. Zhu, R. Crozier, P. Wan, J. Feng, R. D. Ward.(2012). A fuzzy-set-theory-based approach to analyse species membership in DNA barcoding. Molecular Ecology, 21(8):1848-63.
#' https://onlinelibrary.wiley.com/doi/10.1111/j.1365-294X.2011.05235.x
#'
#' Zhang, A. B., D. S. Sikes, C. Muster, S. Q. Li. (2008). Inferring Species Membership using DNA sequences with Back-propagation Neural Networks. Systematic Biology, 57(2):202-215.
#' https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12682
#' @examples
#'
#' data(pineMothITS2)
#' ref<-pineMothITS2
#' que<-ref
#' spe.id<-barcoding.spe.identify2(ref,que, kmer = 1, optimization = FALSE)
#' spe.id
#'
barcoding.spe.identify2<-function (ref, que, kmer = kmer, optimization = TRUE) { # "fixedNumber","optimal"
set.seed(7)
ref.IDs<-rownames(ref)
que.IDs<-rownames(que)
if(length(ref.IDs)==0) ref.IDs<-names(ref)
if(length(que.IDs)==0) que.IDs<-names(que)
if(length(ref.IDs)!=0){
que.IDs<-rownames(que)
if(length(que.IDs)==0) que.IDs<-names(que)
#ref<-del.gaps(ref)
#que<-del.gaps(que)
#names(ref)<-ref.IDs
#names(que)<-que.IDs
}
##########
c2s<-function (chars = c("m", "e", "r", "g", "e", "d")) {
return(paste(chars, collapse = ""))
}
##########
morph.spe<-gsub(".+,","",ref.IDs) # remove sequence ID before ","
Spp2<-as.factor(morph.spe)
len.shortest.seq<-min(as.numeric(summary(ref)[,1]))
#if (kmer>0.05*len.shortest.seq)
# stop("kmer is too large, it will take lots of time to run! a vaule less than 10 is suggested!")
# optimization <- TRUE
#################################################
### functions used!
#################################################
DNAbin2kmerFreqMatrix2<-function(ref,que,kmer=kmer){
### return kmer frequency matrices for both ref and que sequences, but only based on kmers found in ref!!!
### new kmers in que will be ignored
#require(seqinr)
#kmer<-1
### 1. check the format of input arguments
if (class(ref)!="DNAbin")
stop("seqs should be in DNAbin format!")
if (class(kmer)!="integer") kmer<-as.integer(kmer)
### 2. seek unique.kmer.vector for all seqs: u.s
seqs.as.char<-as.character(ref)
seqs.as.char2<-as.character(que)
ifelse(is.vector(seqs.as.char),seqs.as.str.vector<-lapply(seqs.as.char,FUN=c2s),seqs.as.str.vector<-apply(seqs.as.char, MARGIN=1,FUN=c2s))
ifelse(is.vector(seqs.as.char2),seqs.as.str.vector2<-lapply(seqs.as.char2,FUN=c2s),seqs.as.str.vector2<-apply(seqs.as.char2, MARGIN=1,FUN=c2s))
seqs.unique.as.str.vector<-unique(seqs.as.str.vector)
s<- seqs.unique.as.str.vector
u.s0<-unique(substring(s, 1, kmer)) ### check along the column first!
u.s<-u.s0
for (i in 2:(max(nchar(s))-kmer+1)){
# i<-2
#u.s<-u.s0 ### unique str
u.s<-c(u.s,unique(substring(s, i, kmer+i-1)))
u.s<-unique(u.s)
n.char<-nchar(u.s)
size.kmer.exact<-n.char==kmer ### logic to remove short kmer!
u.s<-subset(u.s,size.kmer.exact)
} ### the end of for-loop
### do some cleaning by removing IUPAC codes, "-"
mpattern<-"-+[a-z]*"
u.s<-gsub(mpattern,NA,u.s) # strings with "-"
mpattern<-"[rymkswhbvdn]+"
u.s<-gsub(mpattern,NA,u.s) # strings with "-"
u.s<- u.s[!is.na(u.s)]
### 3. calculate kmer frequency for each sequence in ref
#b<-gregexpr(u.s[1],seqs.as.str.vector[1])
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
#kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]])
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- kmer.freq.one.seq2
for (k in 2:length(seqs.as.str.vector)){
# k<-2
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector[k])
#b<-lapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- c(kmer.freq.one.seq3,kmer.freq.one.seq2)
} ### end of k-for-loop
kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq),length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq))))
#kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq),length(kmer.freq.one.seq)))) ### error!?
rownames(kmer.freq.matrix)<-rownames(ref)
if(length(rownames(kmer.freq.matrix))==0) rownames(kmer.freq.matrix)<-names(ref)
### 4. calculate kmer frequency for each sequence in que
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector2[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
#kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]])
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- kmer.freq.one.seq2
for (k in 2:length(seqs.as.str.vector2)){
# k<-2
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector2[k])
#b<-lapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- c(kmer.freq.one.seq3,kmer.freq.one.seq2)
} ### end of k-for-loop
kmer.freq.matrix2<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq),length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq))))
#kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq),length(kmer.freq.one.seq)))) ### error!?
rownames(kmer.freq.matrix2)<-rownames(que)
if(length(rownames(kmer.freq.matrix2))==0) rownames(kmer.freq.matrix2)<-names(que)
out<-list(unique.str=u.s,kmer.Freq.ref=kmer.freq.matrix, kmer.Freq.que=kmer.freq.matrix2)
#cat(kmer.freq.matrix)
#return(kmer.freq.matrix)
return(out)
}### the end of function
strings.equal<-function(str1,str2){ifelse(str1==str2,1,0)}
FMF<-function(xtheta12){
###
xtheta12<-as.numeric(xtheta12)
if (class(xtheta12)!="numeric" ||length(xtheta12)!=3)
stop("input should be a numeric vector with length of 3!!!")
x<-xtheta12[1]
theta1<-xtheta12[2]
theta2<-xtheta12[3]
##### test:
#x<-0.6289163
#theta1<-0.1465522
#theta2<-0.6379375
##### test:
if (x<=theta1) FMF<-1
if (x>theta1 && x<=(theta2+theta1)/2) FMF<-1-2*((x-theta1)/(theta2-theta1))^2
if (x<=theta2 && x>(theta2+theta1)/2) FMF<-2*((x-theta2)/(theta2-theta1))^2
if (x>=theta2) FMF<-0
return(FMF)
}
FMFtheta12<-function(Ref){
# if (class(seqsRef)!="matrix")
# stop("input should be an object of matrix!")
### 2.1 dealing with input DNA data!
### 2.2 calculate species center vectors
# rm(ref)
# Ref<-pineMothITS2
morph.spe<-gsub(".+,","",rownames(Ref)) # remove sequence ID before ","
if(length(morph.spe)==0) morph.spe<-gsub(".+,","",names(Ref))
#no.morph.spe<-length(unique(morph.spe))
#species.centers<-aggregate(scale(Ref),by=list(morph.spe),FUN="mean")
species.centers<-aggregate(Ref,by=list(morph.spe),FUN="mean")
list.spe<-species.centers[,1]
species.centers<-species.centers[,-1]
### 2.3 seek NN for PS (all species in this case!)
### 2.3.1.calculate pair distance of species centers
units.dist<-dist(species.centers, method = "euclidean", diag = F, upper = T, p = 2)
units.dist0<-units.dist
units.dist<-as.matrix(units.dist) ### important!
#units.dist0<-units.dist
for (i in 1: nrow(units.dist)) {units.dist[i,i]<-NA}
#####
### 2.4. look for elements (their indices) with minimal distance to each other
index1<-numeric(length(unique(morph.spe)))
index2<-index1
min.dist<-index1
for (i in 1:nrow(units.dist)){
# i<-1
index1[i]<-i
b<-which.min(units.dist[i,])
if (length(b)==0)
{index2[i]<-NA
min.dist[i]<-NA}
else {index2[i]<-b
min.dist[i]<-min(units.dist[i,],na.rm=T)}
} ### for loop
pairs<-rbind(index1,index2)
pairs<-t(pairs)
#class(pairs)
pairs<-subset(pairs,subset=!is.na(pairs[,2]))
theta1.tmp<-numeric(length(unique(morph.spe)))
Spp<-morph.spe ### seqsRef$unit.classif
#seqs<-scale(digitized.locus) ### seqsRef$data
#dim(seqs)
uniSpeNames<-unique(Spp)
#table(Spp)
f<-factor(Spp)
#####
###################################
### popSize calculation
###################################
popSize.PS<-as.numeric(table(Spp))
###################################
### theta1,2 calculation
###################################
### for i<-1
#source("eucl.dist.two.vect.R")
seqInOneSpe<-Ref[grep(levels(f)[1], Spp, value = FALSE,fixed = TRUE),]
#dim(seqInOneSpe)==NULL
ifelse(popSize.PS[1]==1,centroid.spe<-seqInOneSpe,centroid.spe<-apply(seqInOneSpe,2,mean)) ###
ifelse(popSize.PS[1]==1,centroid.spe0<-seqInOneSpe,centroid.spe0<-apply(seqInOneSpe,2,mean)) ### centroid.spe0 -
length.sites<-length(centroid.spe)
ifelse(popSize.PS[1]==1,intra.dist<-0,intra.dist<-apply(seqInOneSpe,1,eucl.dist.two.vect,v2=centroid.spe)) ### to the centroid
ifelse(popSize.PS[1]==1,theta1.tmp[1]<-0,theta1.tmp[1]<-sd(intra.dist)) ### theta1 is sligthly different from
#ifelse(popSize.PS[1]==1,theta1.tmp[1]<-0,theta1.tmp[1]<-max(intra.dist)) ### theta1 is sligthly different from
######
for(i in 2:length(levels(f))){
#i<-2
# i<-9
#i=3
cat(paste("i=",i),"\n")
# cat("\n")
#seqInOneSpe<-sDNAbin[grep(levels(f)[i], Spp, value = FALSE,fixed = TRUE),]
seqInOneSpe<-Ref[grep(levels(f)[i], Spp, value = FALSE,fixed = TRUE),]
ifelse(popSize.PS[i]==1,centroid.spe0<-seqInOneSpe,centroid.spe0<-apply(seqInOneSpe,2,mean))
#ifelse(popSize.PS[i]==1,centroid.spe<-seqInOneSpe,centroid.spe<-apply(seqInOneSpe,2,mean))
ifelse(popSize.PS[i]==1,centroid.spe<-c(centroid.spe,seqInOneSpe),centroid.spe<-c(centroid.spe,apply(seqInOneSpe,2,mean)))
#length(seqInOneSpe)
#ifelse(dim(seqInOneSpe)==NULL,theta1.tmp[i]<-0,theta1.tmp[i]<-max(dist(seqInOneSpe))) # error!
#ifelse(popSize.PS[i]==1,theta1.tmp[i]<-0,theta1.tmp[i]<-max(dist(seqInOneSpe)))
ifelse(popSize.PS[i]==1,intra.dist<-eucl.dist.two.vect(seqInOneSpe,centroid.spe0),intra.dist<-apply(seqInOneSpe,1,eucl.dist.two.vect,v2=centroid.spe0))
#intra.dist<-apply(seqInOneSpe,1,eucl.dist.two.vect,v2=centroid.spe0) ### to the centroid
#ifelse(popSize.PS[i]==1,theta1.tmp[i]<-0,theta1.tmp[i]<-max(intra.dist)) ### theta1 is sligthly different from
ifelse(popSize.PS[i]==1,theta1.tmp[i]<-0,theta1.tmp[i]<-sd(intra.dist)) ### theta1 is sligthly different from
#theta1.tmp[i]<-max(dist(seqInOneSpe))
}
centroid.spe.matrix<-t(array(centroid.spe,dim=c(length.sites,length(centroid.spe)%/%length.sites)))
### 1.2 calculate theta2 for each pairt of species
###################
#codes<-out.somu$out.som.unique$codes ### seqsRef$codes
codes<-centroid.spe.matrix
# dim(codes)
theta12.all<-data.frame(list.spe=list.spe,PS=pairs[,1],NN=pairs[,2],stringsAsFactors=TRUE)
#source("eucl.dist.two.vect.R")
theta2.tmp<-numeric(dim(pairs)[1])
for (i in 1:dim(pairs)[1]){
# i<-1
#v1<-codes[i, pairs[i,1]]
#v2<-codes[i, pairs[i,2]]
v1<-codes[pairs[i,1],]
v2<-codes[pairs[i,2],]
theta2.tmp[i]<-eucl.dist.two.vect(v1,v2)
}
theta12.all$theta1<-with(theta12.all,theta1.tmp)
theta12.all$theta2<-with(theta12.all,theta2.tmp)
theta12.all$popSize.PS<-with(theta12.all,popSize.PS)
#theta12.all
return(theta12.all)
#class(theta12.all)
} ### the end of the function
eucl.dist.two.vect<-function(v1,v2){
v1minusv2<-v1-v2
squared.v1minusv2<-v1minusv2*v1minusv2
out.sqrt<-sqrt(sum(squared.v1minusv2))
return(out.sqrt)
}### end of fucntion
DNAbin2kmerFreqMatrix<-function(ref,kmer=kmer){
### return kmer frequency matricies for both ref and que sequences, but only based on kmers found in ref!!!
### new kmers in que will be ignored
#require(seqinr)
#kmer<-1
### 1. check the format of input arguments
if (class(ref)!="DNAbin")
stop("seqs should be in DNAbin format!")
if (class(kmer)!="integer") kmer<-as.integer(kmer)
### 2. seek unique.kmer.vector for all seqs: u.s
seqs.as.char<-as.character(ref)
#seqs.as.char2<-as.character(que)
ifelse(is.vector(seqs.as.char),seqs.as.str.vector<-lapply(seqs.as.char,FUN=c2s),seqs.as.str.vector<-apply(seqs.as.char, MARGIN=1,FUN=c2s))
#ifelse(is.vector(seqs.as.char2),seqs.as.str.vector2<-lapply(seqs.as.char2,FUN=c2s),seqs.as.str.vector2<-apply(seqs.as.char2, MARGIN=1,FUN=c2s))
seqs.unique.as.str.vector<-unique(seqs.as.str.vector)
s<- seqs.unique.as.str.vector
u.s0<-unique(substring(s, 1, kmer)) ### check along the column first!
u.s<-u.s0
### do some cleaning by removing IUPAC codes, "-"
mpattern<-"-+[a-z]*"
u.s<-gsub(mpattern,NA,u.s) # strings with "-"
mpattern<-"[rymkswhbvdn]+"
u.s<-gsub(mpattern,NA,u.s) # strings with "-"
u.s<- u.s[!is.na(u.s)]
for (i in 2:(max(nchar(s))-kmer+1)){
# i<-2
#u.s<-u.s0 ### unique str
u.s<-c(u.s,unique(substring(s, i, kmer+i-1)))
u.s<-unique(u.s)
n.char<-nchar(u.s)
size.kmer.exact<-n.char==kmer ### logic to remove short kmer!
u.s<-subset(u.s,size.kmer.exact)
} ### the end of for-loop
### 3. calculate kmer frequency for each sequence in ref
#b<-gregexpr(u.s[1],seqs.as.str.vector[1])
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
#kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]])
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- kmer.freq.one.seq2
for (k in 2:length(seqs.as.str.vector)){
# k<-2
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector[k])
#b<-lapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- c(kmer.freq.one.seq3,kmer.freq.one.seq2)
} ### end of k-for-loop
kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq),length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq))))
#kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq),length(kmer.freq.one.seq)))) ### error!?
rownames(kmer.freq.matrix)<-rownames(ref)
if(length(rownames(kmer.freq.matrix))==0) rownames(kmer.freq.matrix)<-names(ref)
out<-list(unique.str=u.s,kmer.Freq.ref=kmer.freq.matrix)
#cat(kmer.freq.matrix)
return(kmer.freq.matrix)
#return(out)
}#
#### optimization = FALSE, in this case, a certain kmer will be used, such as kmer = 7
if (optimization!=TRUE){
#kmer<-7
kmer.Freq.ref.que<-DNAbin2kmerFreqMatrix2(ref,que,kmer=kmer)
#class(kmer.Freq.ref.que)
#head(kmer.Freq.ref.que)
#################################################
### check model success rate for ref
#################################################
set.seed(7)
knn1<-knn(kmer.Freq.ref.que$kmer.Freq.ref,
kmer.Freq.ref.que$kmer.Freq.ref,
cl=Spp2, k = 1, l = 0, prob = FALSE, use.all = TRUE)
spe.morph<-as.character(Spp2)
spe.Identified<-as.character(knn1)
spe.morph.Identified<-data.frame(spe.morph,spe.Identified,stringsAsFactors=TRUE)
matches<-apply(spe.morph.Identified,2,strings.equal,str2=spe.morph.Identified[,2])
matches<-colSums(matches,dims = 1)
success.rates.ref<-matches[1]/matches[2]
names(success.rates.ref)<-NULL
#################################################
### make prediction for que
#################################################
#set.seed(7)
knn1<-knn(kmer.Freq.ref.que$kmer.Freq.ref,
kmer.Freq.ref.que$kmer.Freq.que,
cl=Spp2, k = 1, l = 0, prob = FALSE, use.all = TRUE)
spe.morph<-as.character(Spp2)
spe.Identified.que<-as.character(knn1)
#spe.morph.Identified<-data.frame(spe.morph,spe.Identified.que)
#matches<-apply(spe.morph.Identified,2,strings.equal,str2=spe.morph.Identified[,2])
#matches<-colSums(matches,dims = 1)
#success.rates.que<-matches[1]/matches[2]
#names(success.rates.que)<-rownames(que)
#if(length(rownames(que))==0) names(success.rates.que)<-names(que)
#que<-pineMothITS2
#################################################
### calculate theta12 for ref
ref1<-kmer.Freq.ref.que$kmer.Freq.ref
rownames(ref1)<-ref.IDs
if(length(rownames(ref1))==0) names(ref1)<-ref.IDs
FMF.theta12<-FMFtheta12(ref1)
#################################################
### calculate FMF for que identified
que1<-kmer.Freq.ref.que$kmer.Freq.que
rownames(que1)<-que.IDs
if(length(rownames(que1))==0) names(que1)<-que.IDs
FMF.que<-numeric(dim(que1)[1])
for(j in 1:dim(que1)[1]){
#j<-1
#seqInOneSpe<-ref1[grep(spe.Identified[j], FMF.theta12$list.spe, value = FALSE,fixed = TRUE),]
if(length(rownames(ref1))==0){
seqInOneSpe<-ref1[grep(spe.Identified.que[j], names(ref1), value = FALSE,fixed = TRUE),]
}else{seqInOneSpe<-ref1[grep(spe.Identified.que[j], rownames(ref1), value = FALSE,fixed = TRUE),]}
#seqInOneSpe<-ref1[grep(spe.Identified[j], rownames(ref1), value = FALSE,fixed = TRUE),]
#class(FMF.theta12$list.spe)
k<-match(x=spe.Identified.que[j], table=FMF.theta12$list.spe)
#ifelse(FMF.theta12$popSize.PS[k]==1,centroid.spe0<-seqInOneSpe,centroid.spe0<-apply(seqInOneSpe,2,mean))
ifelse(FMF.theta12$popSize.PS[k]==1,centroid.spe0<-seqInOneSpe,centroid.spe0<-apply(seqInOneSpe,2,mean))
que2PS.dist<-eucl.dist.two.vect(que1[j,],centroid.spe0)
#que2PS.dist<-0
#intra.dist<-apply(seqInOneSpe,1,eucl.dist.two.vect,v2=centroid.spe0)
xtheta12<-c(que2PS.dist,FMF.theta12$theta1[k],FMF.theta12$theta2[k])
FMF.que[j]<-FMF(xtheta12)
}
output.identified<-data.frame(queIDs=que.IDs,
spe.Identified=spe.Identified.que,
FMF=FMF.que,stringsAsFactors=TRUE)
out<-list(model.success=success.rates.ref,output_identified=output.identified)
#Bayesian.prob=Bayesian.prob)
#return(output.identified)
# return(out)
}else{
########################################
### kmer.best
########################################
#set.seed(7)
optimize.kmer<-function (ref,max.kmer=max.kmer){
#require(ape)
#set.seed(7)
NAMES<-function(seqs){
if(mode(seqs)=="raw"){
SeqNames<-attr(seqs,"dimnames")[[1]]
}else{ ### mode(seqs)=="list"
SeqNames<-names(seqs)
}
#names(SeqNames)<-NULL
if(length(SeqNames)==0) {
stop("the mode(seqs) is wrong!")
}else{
return(SeqNames) }
}### the end of the function
#seqNames<-NAMES(ref3)
#ref.IDs<-rownames(ref)
ref.IDs<-NAMES(ref)
#ref<-del.gaps(ref)
#names(ref)<-ref.IDs
morph.spe<-gsub(".+,","",ref.IDs) # remove sequence ID before ","
Spp2<-as.factor(morph.spe)
strings.equal<-function(str1,str2){ifelse(str1==str2,1,0)}
DNAbin2kmerFreqMatrix<-function(ref,kmer=kmer){
### return kmer frequency matricies for both ref and que sequences, but only based on kmers found in ref!!!
### new kmers in que will be ignored
#require(seqinr)
#kmer<-1
### 1. check the format of input arguments
if (class(ref)!="DNAbin")
stop("seqs should be in DNAbin format!")
if (class(kmer)!="integer") kmer<-as.integer(kmer)
### 2. seek unique.kmer.vector for all seqs: u.s
seqs.as.char<-as.character(ref)
#seqs.as.char2<-as.character(que)
ifelse(is.vector(seqs.as.char),seqs.as.str.vector<-lapply(seqs.as.char,FUN=c2s),seqs.as.str.vector<-apply(seqs.as.char, MARGIN=1,FUN=c2s))
#ifelse(is.vector(seqs.as.char2),seqs.as.str.vector2<-lapply(seqs.as.char2,FUN=c2s),seqs.as.str.vector2<-apply(seqs.as.char2, MARGIN=1,FUN=c2s))
seqs.unique.as.str.vector<-unique(seqs.as.str.vector)
s<- seqs.unique.as.str.vector
u.s0<-unique(substring(s, 1, kmer)) ### check along the column first!
u.s<-u.s0
for (i in 2:(max(nchar(s))-kmer+1)){
# i<-2
#u.s<-u.s0 ### unique str
u.s<-c(u.s,unique(substring(s, i, kmer+i-1)))
u.s<-unique(u.s)
n.char<-nchar(u.s)
size.kmer.exact<-n.char==kmer ### logic to remove short kmer!
u.s<-subset(u.s,size.kmer.exact)
} ### the end of for-loop
### 3. calculate kmer frequency for each sequence in ref
#b<-gregexpr(u.s[1],seqs.as.str.vector[1])
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
#kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]])
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- kmer.freq.one.seq2
for (k in 2:length(seqs.as.str.vector)){
# k<-2
kmer.freq.one.seq<-sapply(u.s,gregexpr,seqs.as.str.vector[k])
#b<-lapply(u.s,gregexpr,seqs.as.str.vector[1])
kmer.freq.one.seq2<-numeric(length(kmer.freq.one.seq))
for (i in 1:length(kmer.freq.one.seq)){
ifelse(kmer.freq.one.seq[[i]]==-1,kmer.freq.one.seq2[i]<-0,kmer.freq.one.seq2[i]<-length(kmer.freq.one.seq[[i]]))
}
kmer.freq.one.seq3<- c(kmer.freq.one.seq3,kmer.freq.one.seq2)
} ### end of k-for-loop
kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq),length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq))))
#kmer.freq.matrix<-t(array(kmer.freq.one.seq3,dim=c(length(kmer.freq.one.seq3)%/%length(kmer.freq.one.seq),length(kmer.freq.one.seq)))) ### error!?
#rownames(kmer.freq.matrix)<-rownames(ref)
rownames(kmer.freq.matrix)<-NAMES(ref)
out<-list(unique.str=u.s,kmer.Freq.ref=kmer.freq.matrix)
#cat(kmer.freq.matrix)
return(kmer.freq.matrix)
#return(out)
}#
success.rates<-numeric(max.kmer)
for (i in 1:max.kmer){
#set.seed(7)
# i<-1
kmer.freq.ref<-DNAbin2kmerFreqMatrix(ref,kmer=i)
knn1<-knn(kmer.freq.ref, kmer.freq.ref, cl=Spp2, k = 1, l = 0, prob = FALSE, use.all = TRUE)
#knn1<-knn1(kmer.freq.ref, kmer.freq.ref, cl=Spp2)
#attributes(.Last.value)
#attributes(knn1)
#knn1
spe.morph<-as.character(Spp2)
spe.Identified<-as.character(knn1)
spe.morph.Identified<-data.frame(spe.morph,spe.Identified,stringsAsFactors=TRUE)
matches<-apply(spe.morph.Identified,2,strings.equal,str2=spe.morph.Identified[,2])
matches<-colSums(matches,dims = 1)
success.rates[i]<-matches[1]/matches[2]
#cat("i:",i,"\n")
#cat("spe.morph:",spe.morph,"\n")
#cat("spe.Identified:",spe.Identified,"\n")
#cat("success.rates[i]:",success.rates[i],"\n")
}
kmer.best<-which.max(success.rates)
### plot start
plot(1:max.kmer,success.rates,type="h",main="Success rates of spe identification with different length of kmer (ref)",
xlab="k (length of kmer)", ylab="Success rates of spe identification")
axis(1,kmer.best, paste("optimum",kmer.best,sep="\n"),col="red",font=2,col.axis="red")
points(kmer.best,max(success.rates),pch=16,col="red",cex=1.5)
### plot end
success.rates.ref<-max(success.rates)
names(success.rates.ref)<-NULL
out<-c(kmer.best,success.rates.ref)
return(out)
#return(kmer.best)
}
#kmer<-10
#kmer.optimal<-optimal.kmer(ref,max.kmer=kmer)
kmer.optimal<-optimize.kmer(ref,max.kmer=kmer)
success.rates.ref<-kmer.optimal[2]
kmer.optimal<-kmer.optimal[1]
########################################
### species identification after having got kmer.optimal
########################################
kmer.Freq.ref.que<-DNAbin2kmerFreqMatrix2(ref,que,kmer=kmer.optimal)
#################################################
### check model success rate for ref
#################################################
# knn1<-knn(kmer.Freq.ref.que$kmer.Freq.ref,
# kmer.Freq.ref.que$kmer.Freq.ref,
# cl=Spp2, k = 1, l = 0, prob = FALSE, use.all = TRUE)
# spe.morph<-as.character(Spp2)
# spe.Identified<-as.character(knn1)
# spe.morph.Identified<-data.frame(spe.morph,spe.Identified)
# matches<-apply(spe.morph.Identified,2,strings.equal,str2=spe.morph.Identified[,2])
# matches<-colSums(matches,dims = 1)
# success.rates.ref<-matches[1]/matches[2]
# names(success.rates.ref)<-NULL
#################################################
### make prediction for que
#################################################
#set.seed(7)
knn1<-knn(kmer.Freq.ref.que$kmer.Freq.ref,
kmer.Freq.ref.que$kmer.Freq.que,
cl=Spp2, k = 1, l = 0, prob = FALSE, use.all = TRUE)
spe.morph<-as.character(Spp2)
spe.Identified.que<-as.character(knn1)
#spe.morph.Identified<-data.frame(spe.morph,spe.Identified.que)
#matches<-apply(spe.morph.Identified,2,strings.equal,str2=spe.morph.Identified[,2])
#matches<-colSums(matches,dims = 1)
#success.rates.que<-matches[1]/matches[2]
#names(success.rates.que)<-rownames(que)
# if(length(names(success.rates.que))==0) names(success.rates.que)<-names(que)
#################################################
### calculate theta12 for ref
ref1<-kmer.Freq.ref.que$kmer.Freq.ref
#class(ref1)
rownames(ref1)<-ref.IDs
if(length(rownames(ref1))==0) names(ref1)<-ref.IDs
FMF.theta12<-FMFtheta12(ref1)
#################################################
### calculate FMF for que identified
que1<-kmer.Freq.ref.que$kmer.Freq.que ### matrix
class(que1)
rownames(que1)<-que.IDs
if(length(rownames(que1))==0) names(que1)<-que.IDs
FMF.que<-numeric(dim(que1)[1])
for(j in 1:dim(que1)[1]){
#j<-1
#seqInOneSpe<-ref1[grep(spe.Identified[j], FMF.theta12$list.spe, value = FALSE,fixed = TRUE),]
if(length(rownames(ref1))==0){
seqInOneSpe<-ref1[grep(spe.Identified.que[j], names(ref1), value = FALSE,fixed = TRUE),]
}else{seqInOneSpe<-ref1[grep(spe.Identified.que[j], rownames(ref1), value = FALSE,fixed = TRUE),]}
#seqInOneSpe<-ref1[grep(spe.Identified[j], rownames(ref1), value = FALSE,fixed = TRUE),]
#class(FMF.theta12$list.spe)
k<-match(x=spe.Identified.que[j], table=FMF.theta12$list.spe)
#ifelse(FMF.theta12$popSize.PS[k]==1,centroid.spe0<-seqInOneSpe,centroid.spe0<-apply(seqInOneSpe,2,mean))
ifelse(FMF.theta12$popSize.PS[k]==1,centroid.spe0<-seqInOneSpe,centroid.spe0<-apply(seqInOneSpe,2,mean))
que2PS.dist<-eucl.dist.two.vect(que1[j,],centroid.spe0)
#que2PS.dist<-0
#intra.dist<-apply(seqInOneSpe,1,eucl.dist.two.vect,v2=centroid.spe0)
xtheta12<-c(que2PS.dist,FMF.theta12$theta1[k],FMF.theta12$theta2[k])
FMF.que[j]<-FMF(xtheta12)
}
output.identified<-data.frame(queIDs=que.IDs,
spe.Identified=spe.Identified.que,
FMF=FMF.que,stringsAsFactors=TRUE)
out<-list(model.success=success.rates.ref,output_identified=output.identified)
#return(output.identified)
}
class(out) <- c("BarcodingR")
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.