inst/extcode/LibQC.R

#' Phage library QC
#'
#' @docType methods
#' @name LibQC
#' @rdname LibQC
#'
#' @param lib_fastq The path to library fastq.
#' @param outdir The output directory.
#' @param len The length of peptides.
#' @importFrom MAGeCKFlute MapRatesView
#' @export

LibQC <- function(lib_fastq, outdir = ".", len = 12){
  message(Sys.time(), " Extract reads from ", lib_fastq, " ...")
  reads = extractReads(lib_fastq)
  nreads = length(reads)
  ## Remove reads with "N" or have different length
  idx = grepl("N", reads) | nchar(reads)!=len*3
  reads = reads[!idx]

  ## Translate the DNA sequence to Amino acid sequence
  message(Sys.time(), " Translate the DNA sequence to Amino acid sequence ...")
  peptides = NucleoTranslate(reads)

  ## Remove peptide with stop codon
  message(Sys.time(), " Remove reads with stop codon ", lib_fastq, " ...")
  idx = grepl("X", peptides)
  peptides = peptides[!idx]
  reads = reads[!idx]

  ## Ratio of useless peptide
  gg = data.frame(Label = c("Library"), Reads = nreads,
                  RemainPeptide = length(peptides))
  p = MAGeCKFlute::MapRatesView(gg, Mapped = "RemainPeptide")
  p = p + scale_fill_manual(values = c("#9BC7E9", "#1C6DAB"),
                            label = c("Total peptide", "Remain peptide"))
  p = p + theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 1))
  p = p + theme(legend.position = "bottom")
  ggsave(file.path(outdir, "MapRatesView_PhageLibrary.pdf"), p,
         width = 5, height = 4, dpi = 200)

  ## Save library as table
  message(Sys.time(), " Save library as data frame ", lib_fastq, " ...")
  lib = data.frame(id = reads, sequence = reads, peptide = peptides)
  lib = lib[!duplicated(lib$id), ]
  lib$id = 1:nrow(lib)
  write.table(lib, paste0(outdir, "/PhageDisplay_Library.txt"), sep = "\t",
              row.names = FALSE, col.names = FALSE, quote = FALSE)

  ## Distribution of nucleotide on each position
  message(Sys.time(), " Plot the nucleotide distribution of ", lib_fastq, " ...")
  suffix = "Library"
  distr_pos = tableSeqs(reads)
  distr_pos = distr_pos[, sort(colnames(distr_pos))]
  saveRDS(distr_pos, paste0(outdir, "/Distr_Nucleotide_", suffix, ".rds"))

  p2 = polylineView(distr_pos, xlab = "Position", ylab = "Read count")
  p2 = p2 + theme(legend.position = "bottom")
  ggsave(paste0(outdir, "/Polyline_Nucleotide_", suffix, ".pdf"), p2,
         width = 8, height = 3, dpi = 200)

  ## Sequence logo
  ratioNT = t(distr_pos / rowSums(distr_pos))
  p = ggseqlogo(ratioNT, method='custom', seq_type='dna')
  ggsave(paste0(outdir, "/SeqLogo_Nucleotide_", suffix, ".pdf"),
         p, width = 9, height = 3, dpi = 200)

  ## Distribution of amino acid
  message(Sys.time(), " Plot the AA distribution of ", lib_fastq, " ...")
  distr_pos = tableSeqs(peptides)
  distr_pos = distr_pos[, sort(colnames(distr_pos))]
  saveRDS(distr_pos, paste0(outdir, "/Distr_AminoAcid_", suffix, ".rds"))

  p2 = polylineView(distr_pos, xlab = "Position", ylab = "Read count")
  p2 = p2 + theme(legend.position = "bottom")
  p2 = p2 + guides(col = guide_legend(nrow = 2, byrow = TRUE))
  ggsave(paste0(outdir, "/Polyline_AminoAcid_", suffix, ".pdf"), p2,
         width = 8, height = 3, dpi = 200)

  ## Sequence logo
  ratioNT = t(distr_pos / rowSums(distr_pos))
  p = ggseqlogo(ratioNT, method='custom', seq_type='aa')
  ggsave(paste0(outdir, "/SeqLogo_AminoAcid_", suffix, ".pdf"),
         p, width = 8, height = 10, dpi = 200)

  message(Sys.time(), " Library QC is done ...")
}
WubingZhang/PhageR documentation built on July 2, 2019, 9:03 p.m.