inst/extcode/blastEnrichedkmer.R

#' Identify enriched kmers and blast the sequence.
#'
#' @docType methods
#' @name blastEnrichedKmer
#' @rdname blastEnrichedKmer
#'
#' @param kmerSum The path to control fastq.
#' @param outdir The path to output results.
#' @param refseq_fa The path to refseq fasta file.
#' @param blast_dir The path to blast API.
#'
#' @author Wubing Zhang
#' @import Biostrings
#' @import rBLAST
#' @export
blastEnrichedKmer <- function(kmerSum, prefix = "bacTB", outdir = "./",
                              refseq_fa = "~/Jobs/Project/XunBaihui/_Data/6_PhageDisplay/TB_Proteosome/UP000001584_83332.fasta",
                              blast_dir = "/Applications/ncbi-blast-2.9.0+/bin/"){
  ## Identify enriched kmers and blast the sequence.
  message(Sys.time(), " Identify enriched kmers and blast the sequence ...")
  kmerSum = read.table(kmerSum, row.names = 1, header = TRUE, stringsAsFactors = FALSE)
  samples = unique(gsub("\\.p$|\\.lfc$", "", colnames(kmerSum)))
  for(s in samples){
    tmpRes = kmerSum[, paste0(s, c(".lfc", ".p"))]
    colnames(tmpRes) = c("lfc", "pvalue")
    tmpRes = as.data.frame(tmpRes)
    enrichedPeptide = rownames(tmpRes)[tmpRes$lfc>1 & tmpRes$pvalue<0.01]
    blastRes = blastSeq(enrichedPeptide, db = refseq_fa, blast_dir=blast_dir)
    ## Visualize the blast results.
    k = nchar(rownames(kmerSum)[1])
    refseq = readAAStringSet(refseq_fa)
    refseq = as.data.frame(refseq)
    refseq$end = (cumsum(c(1, nchar(refseq$x)))-1)[-1]
    refseq$start = cumsum(c(1, nchar(refseq$x)))[-(nrow(refseq)+1)]
    refseq$Name = gsub(".*MYCTU | OS=.*", "", rownames(refseq))
    rownames(refseq) = gsub(" .*", "", rownames(refseq))
    colnames(refseq)[1] = "sequence"
    blastRes = blastRes[blastRes$Perc.Ident==100&blastRes$Alignment.Length>(k-1), ]
    if(nrow(blastRes)>0){
      blastRes$SubjectID = as.character(blastRes$SubjectID)
      blastRes$QueryID = as.character(blastRes$QueryID)
      blastRes$Name = refseq[blastRes$SubjectID, "Name"]
      blastRes$Start = refseq[blastRes$SubjectID, "start"] + blastRes$S.start - 1
      blastRes$SubjectID = gsub("..\\|", "", gsub("\\|[[:alnum:]]*_[[:alnum:]]*$", "", blastRes$SubjectID))
      blastRes$ID = paste0(blastRes$QueryID, "_", blastRes$SubjectID, "_", blastRes$Name)
      blastRes$lfc = tmpRes[blastRes$QueryID, "lfc"]
      gg = blastRes[, c("ID", "Start", "lfc")]
      write.table(gg, paste0(outdir, "/", prefix, "_", s, "_", k, "mer_blastRes.txt"),
                  row.names = FALSE, sep = "\t", quote = FALSE)
      p = dotView(gg, y = "lfc")
      p = p + xlim(0, max(refseq$end))
      ggsave(paste0(outdir, "/", "dotView_", prefix, "_", s, "_", k, "mer.png"), p,
             width = 12, height = 7, dpi = 200)
    }
  }
}
WubingZhang/PhageR documentation built on July 2, 2019, 9:03 p.m.