R/PeptideCount.R

#' Display the distribution of nucleotide and amino acid.
#'
#' @docType methods
#' @name PeptideCount
#' @rdname PeptideCount
#'
#' @param fastqs paths to fastq files.
#' @param proj The project name.
#' @param outdir The output directory.
#' @param peplen Integer, specifying the length of peptide.
#' @param distrBase Boolean, specifying whether test the distribution at base level.
#' @param distrAA Boolean, specifying whether test the distribution at AA level.
#'
#' @return Count table.
#'
#' @author Wubing Zhang
#' @importFrom ggseqlogo ggseqlogo
#' @export

PeptideCount <- function(fastqs, proj = "Peptide", outdir = "./",
                         peplen = 12, distrBase = FALSE, distrAA = TRUE){
  requireNamespace("ggseqlogo")
  stat_reads = c()
  peptide_list = list()
  for(f in fastqs){
    prefix = gsub(".*/|\\..*", "", f)
    message(Sys.time(), " Process ", prefix, " ...")
    reads = extractReads(f)
    nRaw = length(reads)
    # remove reads with "N" or have different length
    reads = reads[!grepl("N", reads)]
    nN = nRaw-length(reads)
    nL = sum(nchar(reads)>peplen*3)
    nS = sum(nchar(reads)<peplen*3)
    reads = reads[nchar(reads)==peplen*3]
    if(length(reads)<10) next
    # Test the distribution of nucleotide in each position
    if(distrBase){
      message(Sys.time(), " Test the nucleotide distribution ...")
      distr_pos = tableSeqs(reads, peplen*3)
      distr_pos = distr_pos[, sort(colnames(distr_pos))]
      # p1 = dodgeBarView(distr_pos, xlab = "Position", ylab = "Read count",
      #                   filename = paste0(outdir, "/", proj, "_dodgeBar_Nucleotide_", prefix, ".pdf"),
      #                   width = 8, height = 3, dpi = 200)
      # p2 = polylineView(distr_pos, xlab = "Position", ylab = "Read count",
      #                   filename = paste0(outdir, "/", proj, "_polyline_Nucleotide_", prefix, ".pdf"),
      #                   width = 8, height = 3, dpi = 200)
      ratioNT = t(distr_pos / rowSums(distr_pos))
      p = ggseqlogo(ratioNT, method='custom', seq_type='dna')
      ggsave(paste0(outdir, "/", proj, "_SeqLogo_Nucleotide_", prefix, ".pdf"),
             p, width = 9, height = 3, dpi = 200)
    }

    # Amino acid
    message(Sys.time(), " Translate the sequences ...")
    peptides = NucleoTranslate(reads)
    idx = grepl("X", peptides)
    nStop = sum(idx)
    peptides = peptides[!idx]
    peptide_list[[f]] = peptides
    message("The number of peptides with stop codon: ", sum(idx))
    stat_reads = rbind(stat_reads, c(nRaw, nN, nL, nS, nStop, length(unique(reads)), length(unique(peptides))))
    rownames(stat_reads)[nrow(stat_reads)] = prefix
    if(length(peptides)<10) next
    if(distrAA){
      message("The distribution of AA ... ")
      distr_pos = tableSeqs(peptides)
      distr_pos = distr_pos[, sort(colnames(distr_pos))]
      # p1 = dodgeBarView(distr_pos, xlab = "Position", ylab = "Read count",
      #                   filename = paste0(outdir, "/", proj, "_dodgeBar_AminoAcid_", prefix, ".pdf"),
      #                   width = 8, height = 3, dpi = 200)
      # p2 = polylineView(distr_pos, xlab = "Position", ylab = "Read count",
      #                   filename = paste0(outdir, "/", proj, "_polyline_AminoAcid_", prefix, ".pdf"),
      #                   width = 8, height = 3, dpi = 200)
      ratioNT = t(distr_pos / rowSums(distr_pos))
      p = ggseqlogo(ratioNT, method='custom', seq_type='dna')
      ggsave(paste0(outdir, "/", proj, "_SeqLogo_AminoAcid_", prefix, ".pdf"),
             p, width = 8, height = 10, dpi = 200)
    }
  }
  # saveRDS(peptide_list, paste0(outdir, "/", proj, "_Peptides_list.rds"))
  ## Write out read level statistics ##
  message(Sys.time(), " Write out statistics ...")
  colnames(stat_reads) = c("TotalReads", "NReads", "LongerReads",
                           "ShorterReads", "ReadsWithStop", "UniqueReads", "UniquePeptides")
  write.table(stat_reads, paste0(outdir, "/", proj, "_Reads_Statistic.txt"), sep = "\t", quote = FALSE)

  ## Merge count tables ##
  message(Sys.time(), " Merge count tables ...")
  rawcount = NULL
  for(f in names(peptide_list)){
    tmpcount = as.data.frame(table(peptide_list[[f]]), stringsAsFactors = FALSE)
    colnames(tmpcount) = c("Peptide", f)
    if(is.null(rawcount)){
      rawcount = tmpcount
    }else{
      rawcount = merge(rawcount, tmpcount, all = TRUE, sort = FALSE, by = "Peptide")
    }
  }
  rownames(rawcount) = rawcount$Peptide
  rawcount = rawcount[,-1, drop = FALSE]
  rawcount[is.na(rawcount)] = 0
  colnames(rawcount) = gsub(".*\\/|.fq.*", "", colnames(rawcount))
  # write.table(rawcount, paste0(outdir, "/Summary_count.txt"),
  #             row.names = FALSE, sep = "\t", quote = FALSE)
  # return count table #
  return(rawcount)
}
WubingZhang/PhageR documentation built on July 2, 2019, 9:03 p.m.