R/FN_KTU.R

Defines functions histNdistri KTUsim.eval kaxonomy makektudb trim.primer

Documented in histNdistri kaxonomy KTUsim.eval makektudb trim.primer

#' Generate a kmer table from representative sequences
#'
#' Convert DNA sequences to tetranucleotide frequency table
#' @param repseq The fasta file path
#' @param file logical, default is TRUE, loading DNA sequences from file. if 'file=FALSE', the 'repseq' is loaded from object.
#' @return A 256 tetranucleotide combinations frequency table
#' @export
tetra.freq <- function (repseq, pscore = FALSE, file = TRUE)
{
  rev_comp <- function(x) {
    x.r <- paste0(rev(strsplit(x, "")[[1]]), collapse = "")
    x.r <- gsub("A", "1", x.r)
    x.r <- gsub("T", "2", x.r)
    x.r <- gsub("C", "3", x.r)
    x.r <- gsub("G", "4", x.r)
    x.r <- gsub("1", "T", x.r)
    x.r <- gsub("2", "A", x.r)
    x.r <- gsub("3", "G", x.r)
    x.r <- gsub("4", "C", x.r)
    return(x.r)
  }
  readseq <- function(seq) {
    x <- seq
    x <- paste0(x, collapse = "")
    x.r <- rev_comp(x)
    return(list(x, x.r))
  }
  if (isTRUE(file)) {
    scaffolds = Biostrings::readDNAStringSet(filepath = repseq,
                                             use.names = T)
  }
  else scaffolds = Biostrings::DNAStringSet(repseq)
  asv.id <- scaffolds@ranges@NAMES
  species <- lapply(scaffolds, function(x) readseq(as.character(x)))
  DNA <- c("A", "T", "C", "G")
  tetra.mer <- expand.grid(DNA, DNA, DNA, DNA)
  tetra.mer <- do.call(paste0, tetra.mer)
  tetra.table <- matrix(nrow = 256, ncol = length(species))
  rownames(tetra.table) <- tetra.mer
  if (isTRUE(pscore)) {
    for (i in 1:256) {
      for (j in 1:length(species)) {
        single.forward <- ifelse(length(grep(tetra.mer[i],species[[j]][[1]])) > 0, grep(tetra.mer[i],species[[j]][[1]]), 0)
        single.reverse <- ifelse(length(grep(tetra.mer[i],species[[j]][[2]])) > 0, grep(tetra.mer[i],species[[j]][[2]]), 0)
        tetra.table[i, j] <- (single.forward + single.reverse)
      }
    }
  }
  else if (!isTRUE(pscore)) {
    for (j in 1:length(species)) {
      #print(j)
      single.forward <- S4Vectors::elementNROWS(Biostrings::matchPDict(Biostrings::PDict(tetra.mer),Biostrings::DNAString(unlist(species[[j]][1]))))
      single.reverse <- S4Vectors::elementNROWS(Biostrings::matchPDict(Biostrings::PDict(tetra.mer),Biostrings::DNAString(unlist(species[[j]][2]))))
      tetra.table[, j] <- (single.forward + single.reverse)
    }
  }
  colnames(tetra.table) <- asv.id
  tetra.table <- prop.table(tetra.table, 2)
  return(tetra.table)
}


#' KTU clustering
#'
#' K-mer taxonomic unit clustering
#' @param repseq The fasta file path
#' @param pscore pscore=TRUE/FLASE: using k-mer present scores (tetranucleotide features 0:absent; 1:present in one direction; 2:present in both direction of a sequence)/k-mer frequency
#' @param feature.table (optional) 'data.frame' formated ASV/OTU table
#' @param write.fasta (optional) write out a representative KTU sequence fasta file
#' @param step Split searching range for optimal K. Two step searching process: large scale searching in the first round and smaller scale searching in the second round. Default is 'step=c(5,10)'.
#' @param search.min The minimum K for searching, default is 'NULL' = tip numbers at 0.03 height of cosine hierarchical clustering tree
#' @param search.max The maximum K for searching, default is 'NULL' = tip numbers at 0.015 height of cosine hierarchical clustering tree
#' @param cores Numbers of CPUs
#' @return KTU.table Aggregated KTU table
#' @return ReqSeq Representative KTU sequences
#' @return kmer.table Tetranucleotide present score table or tetranucleotide frequency table
#' @return clusters K-clusters of input features
#' @export
klustering <- function (repseq, pscore = FALSE, feature.table = NULL, write.fasta = TRUE,
                        step = c(5, 10), search.min = NULL, search.max = NULL, cores = 1)
{
  require(foreach, quietly = T)
  cl <- parallel::makeCluster(cores)
  doParallel::registerDoParallel(cl)
  rev_comp <- function(x) {
    x.r <- paste0(rev(strsplit(x, "")[[1]]), collapse = "")
    x.r <- gsub("A", "1", x.r)
    x.r <- gsub("T", "2", x.r)
    x.r <- gsub("C", "3", x.r)
    x.r <- gsub("G", "4", x.r)
    x.r <- gsub("1", "T", x.r)
    x.r <- gsub("2", "A", x.r)
    x.r <- gsub("3", "G", x.r)
    x.r <- gsub("4", "C", x.r)
    return(x.r)
  }
  readseq <- function(seq) {
    x <- seq
    x <- paste0(x, collapse = "")
    x.r <- rev_comp(x)
    return(list(x, x.r))
  }
  aggregate2df <- function(data, groups, FUN) {
    agg.data <- aggregate(data, list(groups), FUN)
    rownames(agg.data) <- agg.data[, 1]
    agg.data <- as.data.frame(t(agg.data[, -1]))
    return(agg.data)
  }
  scaffolds = Biostrings::readDNAStringSet(filepath = repseq,
                                           use.names = T)
  asv.id <- scaffolds@ranges@NAMES
  species <- lapply(scaffolds, function(x) readseq(as.character(x)))
  DNA <- c("A", "T", "C", "G")
  tetra.mer <- expand.grid(DNA, DNA, DNA, DNA)
  tetra.mer <- do.call(paste0, tetra.mer)
  tetra.table <- matrix(nrow = 256, ncol = length(species))
  rownames(tetra.table) <- tetra.mer
  if (isTRUE(pscore)) {
    for (i in 1:256) {
      for (j in 1:length(species)) {
        single.forward <- ifelse(length(grep(tetra.mer[i],species[[j]][[1]])) > 0, grep(tetra.mer[i],species[[j]][[1]]), 0)
        single.reverse <- ifelse(length(grep(tetra.mer[i],species[[j]][[2]])) > 0, grep(tetra.mer[i],species[[j]][[2]]), 0)
        tetra.table[i, j] <- (single.forward + single.reverse)
      }
    }
  }
  else if (!isTRUE(pscore)) {
    tetra.table <- foreach(j=1:length(species),.combine = cbind) %dopar%
      {
        single.forward <- S4Vectors::elementNROWS(Biostrings::matchPDict(Biostrings::PDict(tetra.mer),
                                                                         Biostrings::DNAString(unlist(species[[j]][1]))))
        single.reverse <- S4Vectors::elementNROWS(Biostrings::matchPDict(Biostrings::PDict(tetra.mer),
                                                                         Biostrings::DNAString(unlist(species[[j]][2]))))
        tetras <- (single.forward + single.reverse)
        tetras
      }
    rownames(tetra.table) <- tetra.mer
  }
  colnames(tetra.table) <- asv.id
  tetra.table <- prop.table(tetra.table, 2)
  cos <- as.dist(1 - coop::cosine(tetra.table))
  tree <- hclust(cos)
  if (is.null(search.max))
    search.max <- max(cutree(tree, h = 0.015))
  else search.max <- search.max
  if (is.null(search.min))
    search.min <- max(cutree(tree, h = 0.03))
  else search.min <- search.min
  print(paste("Searching range:",search.min,"to",search.max))
  steps <- ceiling(seq(search.min, search.max, length.out = step[1]))
  repeat {
    steps.update <- steps
    asw <- foreach::foreach(k = steps.update, .combine = c) %dopar%
      {
        temp.asw = cluster::pam(cos, k, do.swap = F,
                                pamonce = 5)$silinfo$avg.width
        temp.asw
      }
    k.best <- steps.update[order(asw, decreasing = T)][1:2]
    print(k.best)
    steps <- ceiling(seq(k.best[2], k.best[1], length.out = step[1]))
    if (all(steps == steps.update))
      break
  }
  steps <- unique(ceiling(seq(k.best[2], k.best[1], length.out = step[2])))
  rm(k.best)
  repeat {
    steps.update <- steps
    asw <- foreach::foreach(k = steps.update, .combine = c) %dopar%
      {
        temp.asw = cluster::pam(cos, k, do.swap = F,
                                pamonce = 5)$silinfo$avg.width
        temp.asw
      }
    if (length(steps.update) > 1) {
      k.best <- steps.update[order(asw, decreasing = T)][1:2]
    }
    else k.best <- c(steps.update[1], steps.update[1])
    print(k.best)
    steps <- unique(ceiling(seq(k.best[2], k.best[1], length.out = step[2])))
    if (all(steps == steps.update))
      break
  }
  k.best <- steps.update[which.max(asw)]
  print(paste("Best KTU #:", k.best))
  parallel::stopCluster(cl)
  kms <- cluster::pam(cos, k.best, do.swap = F, pamonce = 5)
  centroid <- kms$id.med
  crepseq <- sapply(species[centroid], "[[", 1)
  for (i in 1:length(centroid)) names(crepseq)[i] <- digest::digest(crepseq[i],
                                                                    algo = "md5", serialize = F)
  kmer.table <- data.frame(tetra.table[, centroid])
  colnames(kmer.table) <- names(crepseq)
  if (isTRUE(write.fasta)) {
    repseq <- matrix(rbind(paste0(">", names(crepseq)),
                           crepseq), ncol = 1)
    write(repseq, file = "ktu-sequence.fasta")
  }
  if (!is.null(feature.table)) {
    feature.table <- feature.table[match(asv.id, feature.table[,
                                                               1], nomatch = 0), ]
    otu <- feature.table[, -1]
    ktu <- as.data.frame(t(aggregate2df(otu, kms$clustering,
                                        sum)))
    rownames(ktu) <- names(crepseq)
    colnames(ktu) <- colnames(otu)
    return(list(KTU.table = ktu, ReqSeq = crepseq, kmer.table = kmer.table,
                clusters = kms$clustering))
  }
  else return(list(ReqSeq = crepseq, kmer.table = kmer.table,
                   clusters = kms$clustering))
}



#' Prepare KTU taxonomy assignment database by trimming primers
#'
#' Trim primer sequences of full length 16S/18S/target gene database by PCR primers
#' @param fasta input database fasta file
#' @param forseq forward primer sequence (5' -> 3')
#' @param revseq reverse primer sequence (5' -> 3')
#' @param output.name (optional) the name for output file
#' @param progress show the processing progress
#' @export
trim.primer <- function(fasta,forseq,revseq,output.name=NULL,progress=TRUE){
  #read fasta file
  fastasets <- Biostrings::readDNAStringSet(fasta)
  seqNames <- fastasets@ranges@NAMES
  #format primer seq & reverse complement
  Lpattern=Biostrings::DNAString(forseq)
  Rpattern=Biostrings::reverseComplement(Biostrings::DNAString(revseq))
  #search primer position
  Forward <- Biostrings::vmatchPattern(pattern = Lpattern,subject = fastasets,fixed = F)
  Reverse <- Biostrings::vmatchPattern(pattern = Rpattern,subject = fastasets,fixed = F)

  trimmed <- c()
  for(i in 1:length(fastasets)){
    if(length(Forward[[i]]) > 0 & length(Reverse[[i]]) >0 ){
      Fstart <- ifelse(length(Forward[[i]])>1,Forward[[i]][1]@start,Forward[[i]]@start)
      Fwidth <- ifelse(length(Forward[[i]])>1,Forward[[i]][1]@width,Forward[[i]]@width)
      Rend   <- ifelse(length(Reverse[[i]])>1,Reverse[[i]][length(Reverse[[i]])]@start,Reverse[[i]]@start)
      if(Rend-(Fstart+Fwidth)>0){
      trimmed[i] <- as.character(fastasets[[i]][(Fstart+Fwidth):(Rend-1)])
      } else trimmed[i] <- NA
    } else trimmed[i] <- NA

    if(progress==TRUE){
      print(paste0(i,"/",length(fastasets)," sequences from DB are processed"))
      flush.console()
      Sys.sleep(0.01)
    }
  }

  names(trimmed) <- seqNames
  trimmed <- trimmed[!is.na(trimmed)]
  output <- matrix(rbind(paste0(">",names(trimmed)),trimmed),ncol=1)
  if(is.null(output.name)){
    write(output,file = "trimmed-sequence.fasta")
  } else if(!is.null(output.name)) write(output,file = paste0(output.name,"_trimmed-sequence.fasta"))
}

#' Build KTU taxonomy assignment database by tetranucleotide frequency table
#'
#' make KTU db
#' @param input.fasta input database fasta file (trimmed)
#' @param input.taxa input database taxonomy file
#' @param pscore pscore=TRUE/FLASE: using k-mer present scores (tetranucleotide features 0:absent; 1:present in one direction; 2:present in both direction of a sequence)/k-mer frequency
#' @param output.file (optional) the directory for output RDS format file
#' @export
makektudb <- function(input.fasta,input.taxa,pscore=FALSE,output.file=NULL){
  rev_comp <- function(x){
    x.r <- paste0(rev(strsplit(x,"")[[1]]),collapse = "")
    x.r <- gsub("A","1",x.r);x.r <- gsub("T","2",x.r);x.r <- gsub("C","3",x.r);x.r <- gsub("G","4",x.r)
    x.r <- gsub("1","T",x.r);x.r <- gsub("2","A",x.r);x.r <- gsub("3","G",x.r);x.r <- gsub("4","C",x.r)
    return(x.r)
  }
  readseq <- function(seq){
    x <- seq
    x <- paste0(x,collapse = "")
    x.r <- rev_comp(x)
    return(list(x,x.r))
  }
  scaffolds = Biostrings::readDNAStringSet(filepath = input.fasta, use.names = T)

  DNA <- c("A","T","C","G")
  tetra.mer <- expand.grid(DNA,DNA,DNA,DNA)
  tetra.mer <- do.call(paste0,tetra.mer)

  species <- lapply(scaffolds, readseq)

  tetra.table <- matrix(nrow=256,ncol=length(species))
  rownames(tetra.table) <- tetra.mer

  if(isTRUE(pscore)){
    for(i in 1:256){
      for(j in 1:length(species)){
        single.forward <- ifelse(length(grep(tetra.mer[i],species[[j]][[1]])) > 0, grep(tetra.mer[i], species[[j]][[1]]),0)
        single.reverse <- ifelse(length(grep(tetra.mer[i],species[[j]][[2]])) > 0, grep(tetra.mer[i], species[[j]][[2]]),0)
        tetra.table[i,j] <- (single.forward+single.reverse)
      }
    }
  } else if (!isTRUE(pscore)) {
    for (j in 1:length(species)) {
      #print(j)
      single.forward <- S4Vectors::elementNROWS(Biostrings::matchPDict(Biostrings::PDict(tetra.mer),Biostrings::DNAString(unlist(species[[j]][1]))))
      single.reverse <- S4Vectors::elementNROWS(Biostrings::matchPDict(Biostrings::PDict(tetra.mer),Biostrings::DNAString(unlist(species[[j]][2]))))
      tetra.table[, j] <- (single.forward + single.reverse)
    }
  }

  colnames(tetra.table) <- scaffolds@ranges@NAMES
  tetra.table <- prop.table(tetra.table,2)

  taxa <- read.delim(input.taxa,header = F)
  taxa <- taxa[match(scaffolds@ranges@NAMES,taxa[,1]),]

  if(!is.null(output.file)){
    saveRDS(tetra.table,file = paste0(output.file,"_DB.RDS"))
    saveRDS(taxa,file = paste0(output.file,"_TX.RDS"))
  }
  return(list(tetra.table,taxa))
}

#' KTU annotation
#'
#' KTU-based alignment-free taxonomy assignment
#' @param dbRDS Kmer formated database RDS file
#' @param taxaRDS taxonomy ID RDS file
#' @param kmer.table tetranucleotide frequency table for annotation
#' @param cos.cutff cosine similarity cutoff, default=0.95
#' @param consensus taxonomy assignment by consensus [0-1]
#' @param candidate how many candidates (≥ cos.cutff) for taxonomy assignment
#' @param cores Numbers of CPUs
#' @export
kaxonomy <- function(dbRDS,taxaRDS,kmer.table,cos.cutff=0.95,consensus=0.8,candidate=10,cores=1){
  require(foreach,quietly = T)
  cl <- parallel::makeCluster(cores) #not to overload your computer
  doParallel::registerDoParallel(cl)

  db.tetra <- readRDS(dbRDS)
  db.taxa <- readRDS(taxaRDS)
  {
    taxa <- as.character(db.taxa[,2])
    sep <- ifelse(grepl("; ",taxa[1],fixed = T),"; ",";")
    taxa <- strsplit(taxa,sep)
    taxa <- data.frame(do.call(rbind,taxa),row.names = db.taxa[,1])
  }
  cos.table <- data.frame(matrix(data = NA,nrow = nrow(db.taxa),ncol=ncol(kmer.table)))
  cos.table <- foreach::foreach(cln=1:ncol(kmer.table), .combine=cbind) %dopar% {
    temp.cos.table = apply(db.tetra,2,function(x) coop::cosine(kmer.table[,cln],x)) #calling a function
    temp.cos.table #Equivalent to finalMatrix = cbind(finalMatrix, tempMatrix)
  }
  parallel::stopCluster(cl) #stop cluster

  machine.core <- function(x){
    xxx <- matrix(ncol = 8)
    xx <- cbind(taxa[which(x>=cos.cutff),],cos.score=x[which(x>=cos.cutff)])
    if(nrow(xx)>0){
      #xx <- xx[xx$cos.score>=quantile(xx$cos.score,top.pct,na.rm=TRUE),]
      ranks <- rank(1/(xx$cos.score))
      if(min(ranks)>candidate){
        xx <- xx[which(ranks==min(ranks)),]
      } else xx <- xx[which(ranks<=candidate),]
      xx[,-8] <- apply(xx[,-8], 2, as.character)
      xy <- apply(xx,2,function(x) names(table(x))[which(prop.table(table(x))>consensus)])
      xxx[,1:7] <- c(xy[[1]][1],xy[[2]][1],xy[[3]][1],xy[[4]][1],xy[[5]][1],xy[[6]][1],xy[[7]][1])
      #xxx[,8] <- mean(unlist(apply(xx,2,function(x) prop.table(table(x))[which(prop.table(table(x))>consensus)])))
      xxx[,8] <- mean(mean(xx[,8]) * unlist(apply(xx,2,function(x) prop.table(table(x))[which(prop.table(table(x))>consensus)])))
      xxx[,7] <- ifelse(!is.na(xxx[,7]) & is.na(xxx[,5]),NA,xxx[,7])
      for(i in 7:2) xxx[,i] <- ifelse(!is.na(xxx[,i]) & is.na(xxx[,i-1]),NA,xxx[,i])
    } else if(nrow(xx)==0) xxx[,c(1,8)] <- c("Unassigned",1)
    return(xxx)
  }

  taxa.consensus <- lapply(X = as.list(as.data.frame(cos.table)),
                                       FUN = machine.core)
  taxa.consensus <- data.frame(do.call(rbind,taxa.consensus))
  colnames(taxa.consensus) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species","Cos.score")
  taxa.consensus <- apply(taxa.consensus, 2, as.character)
  taxa.consensus[is.na(taxa.consensus)] <- "Unassigned"
  taxa.consensus <- as.data.frame(taxa.consensus)
  taxa.consensus[,8] <- as.numeric(as.character(taxa.consensus[,8]))
  rownames(taxa.consensus) <- colnames(kmer.table)
  return(taxa.consensus)
}


#' KTU evaluation
#'
#' Evaluate sequence similarity within KTUs
#' @param klusterRDS klustering output RDS file
#' @param ASVfasta representative ASV sequence fasta file
#' @export
KTUsim.eval <- function(klusterRDS,ASVfasta){
  kluster <- readRDS(klusterRDS)
  dna <- Biostrings::readDNAStringSet(ASVfasta,use.names = T)
  dna <- lapply(dna, as.character)
  dna <- sapply(dna, "[[", 1)
  dna <- dna[match(names(kluster$clusters),names(dna))]

  mean.simi <- c()
  Ks <- c()
  kn.stats <- table(kluster$clusters)
  for(k in 1:length(kn.stats)){
    if(kn.stats[k]>1){
      dna.cluster.n <- dna[which(kluster$clusters==k)]
      dna.cluster.cb <- combn(x = 1:length(dna.cluster.n),m = 2)
      dna.seq.cb <- apply(dna.cluster.cb,2,function(x) dna.cluster.n[x])
      within.simi <- c()
      for(i in 1:ncol(dna.seq.cb)) within.simi[i] <- Biostrings::pid(Biostrings::pairwiseAlignment(dna.seq.cb[1,i],dna.seq.cb[2,i]))
      mean.simi[k] <- mean(within.simi)

      tetra <- tetra.freq(dna[which(kluster$clusters==k)],file = F)
      Ks[k] <- mean(as.dist(1-coop::cosine(tetra)))
      Ks[k] <- ifelse(Ks[k]>0,Ks[k],0) # check no negative values
    } else if(kn.stats[k]==1){
      mean.simi[k] <- 100
      Ks[k] <- NA
    }
    print(paste("mean similarity within Kluster",k,"=",mean.simi[k],"% from",kn.stats[k],"seq features; divergence =",Ks[k]))
  }
  g.mean.simi <- mean(mean.simi)
  g.Ks <- mean(Ks[!is.na(Ks)])
  print(paste("Mean similarity of 'within KTU' of", length(kn.stats), "KTUs =",g.mean.simi,"; Mean divergence within KTU =",g.Ks))
  #hist(mean.simi)
  return(list(eachmean=data.frame(kluster=1:length(kn.stats),similarity.pct=mean.simi,divergence=Ks,n.feature=as.data.frame(kn.stats)$Freq),globalmean=g.mean.simi,globaldivergence=g.Ks))
}


#' plot KTU evaluations
#'
#' Evaluate sequence similarity within KTUs by histogram
#' @param histdata KTU evaluation variable
#' @param coll color for distribution curve
#' @export
histNdistri <- function(histdata,coll,xlab.text=NULL,legend.posit="topright",legend.text="",...){
  h <- hist(histdata,main="",xlab=xlab.text,...)
  xfit<-seq(min(histdata),max(histdata),length=100)
  yfit<-dnorm(xfit,mean=mean(histdata),sd=sd(histdata))
  yfit <- yfit*diff(h$mids[1:2])*length(histdata)
  lines(xfit,yfit,type="l",lwd=2,col=coll,xlab="",ylab="",xaxt="n",yaxt="n",bty="n",xpd=T)
  legend(x=legend.posit,legend = legend.text,bty="n")
}
poyuliu/KTU documentation built on May 23, 2022, 11:06 a.m.