R/PeptideDisplayPipe.R

#' The analysis pipeline of peptide display data
#'
#' @docType methods
#' @name PeptideDisplayPipe
#' @rdname PeptideDisplayPipe
#'
#' @param fastqs A vector of path to fastq files.
#' @param rawcount Raw count matrix.
#' @param proj Character, can be the experiment name.
#' @param outdir The path to output directory.
#' @param pwAlignRes A data frame.
#' @param pwAlignRDS The path to pairwise alignment dictionary.
#' @param top Integer, specifying the number of top peptides to be selected.
#' @param topnames A vector of proteins to be labeled.
#' @param ctl Index of controls.
#' @param trt Index of treatments.
#' @param peplen The length of peptides in the library.
#' @param distrAA Boolean, indicating whether test the distribution of AA in peptides.
#' @param nperm The permutation time.
#' @param core The number of core to use.
#' @param refseq_fa The path to refseq fasta file.
#'
#' @examples
#' setwd("~/Jobs/Project/XunBaihui/_Data/6_PhageDisplay/20190124/")
#' ctl_fq = "fastq/s20190124-XBH-42_TB_patient1_1st.fq.gz"
#' trt_fq = "fastq/s20190124-XBH-65_TB_patient1_3rd.fq.gz"
#' proj = "Patient1"
#' outdir = "./"
#' PeptideDisplayPipe(c(ctl_fq, trt_fq), proj=proj)
#' @author Wubing Zhang
#' @import matrixStats Biostrings data.table ggView
#' @export
#'
PeptideDisplayPipe <- function(fastqs = NULL, rawcount = NULL,
                               proj = "Peptide", outdir = "./",
                               pwAlignRes = NULL, pwAlignRDS = "Bacterial_Display_Mmu.rds",
                               top = 1000, topnames = c("PDCD1", "CTLA4", "PD1L1", "PD1L2"),
                               ctl=1, trt=2, peplen = 12, distrAA = FALSE,
                               nperm = 30, core = 2,
                               refseq_fa = "~/Jobs/Project/XunBaihui/_Data/6_PhageDisplay/Mmu_Proteosome/UP000000589_10090.fasta"){

  require(ggView)
  message(Sys.time(), " Count reads and test the distribution ...")
  if(!is.null(fastqs)){
    rawcount <- PeptideCount(fastqs=fastqs[c(ctl, trt)], proj=proj, outdir=outdir,
                             peplen=peplen, distrAA=distrAA)
    write.table(rawcount, paste0(outdir, "/", proj, "_rawcount.txt"),
                row.names = TRUE, sep = "\t", quote = FALSE)
    ctl = 1:length(ctl); trt = (length(ctl)+1):length(c(ctl,trt))
  }
  rawcount = as.matrix(rawcount)
  rawcount = rawcount[, c(ctl, trt)]
  ctl = 1:length(ctl); trt = (length(ctl)+1):ncol(rawcount)
  # normcount = log2(1e6 * t(t(rawcount) / colSums(rawcount)) + 1)
  normcount = ggView::TransformCount(rawcount, method = "vst")

  message(Sys.time(), " Generate pairwise alignment results ...")
  if(is.null(pwAlignRes)){
    if(file.exists(pwAlignRDS))
      pwAlignRes = readRDS(pwAlignRDS)
    else
      pwAlignRes = GeneratePWAlignDict(peptides = rownames(rawcount),
                                       refseq_fa = refseq_fa,
                                       rdsfile = pwAlignRDS, core = core)
  }
  kmerNormCount = kmerScan(normcount, kmer = peplen)
  message(Sys.time(), " Principle component analysis ...")
  p = pcView(kmerNormCount, ctl=ctl, trt=trt)
  ggsave(paste0(outdir, "/", proj, "_PCA_Samples.png"), p,
         width = 6, height = 4, dpi = 200)

  message(Sys.time(), " Identify enriched peptides ...")
  lfc = kmerNormCount[, trt] - kmerNormCount[, ctl]
  lfc = sort(lfc, decreasing = TRUE)
  # lfc[lfc<0] = 0
  lfc = as.matrix(lfc)
  colnames(lfc) = colnames(kmerNormCount)[trt]
  # idx = rowMeans(lfc) > lfcCutoff
  enriched_peptides = rownames(lfc)[1:top]

  message("\tThe number of enriched peptides: ", length(enriched_peptides))
  if(length(enriched_peptides)<3) next

  message(Sys.time(), " compute score for each proteinsome position ...")
  refseq = readAAStringSet(refseq_fa)
  refseq = as.data.frame(refseq)
  refseq$ID = rownames(refseq)
  rownames(refseq) = gsub(".*\\||_.*", "", rownames(refseq))
  colnames(refseq)[1] = "sequence"
  refseq$end = cumsum(nchar(refseq$sequence))
  refseq$start = refseq$end - nchar(refseq$sequence) + 1

  if(!all(enriched_peptides %in% pwAlignRes$Peptide)){
    tmp_peptides = setdiff(enriched_peptides, pwAlignRes$Peptide)
    tmp_dict = GeneratePWAlignDict(peptides = tmp_peptides,
                                   refseq_fa = refseq_fa, core = core)
    pwAlignRes = rbind.data.frame(pwAlignRes, tmp_dict)
    saveRDS(pwAlignRes, pwAlignRDS)
  }
  pwaRes = pwAlignRes[pwAlignRes$Peptide %in% enriched_peptides, ]
  pwaRes = cbind(pwaRes, lfc[pwaRes$Peptide,ncol(lfc),drop=FALSE]);
  # pwaRes = cbind(pwaRes, lfc[pwaRes$Peptide,ncol(lfc),drop=FALSE]);
  # colnames(pwaRes)[ncol(pwaRes)] = colnames(lfc)[ncol(lfc)]
  scores = PositionScore(pwaRes, refseq, core)

  message(Sys.time(), " Generate background distribution by permutation ...")
  if(file.exists(paste0(outdir, "/Background_", ceiling(length(enriched_peptides)/50), ".rds"))){
    background = readRDS(paste0(outdir, "/Background_", ceiling(length(enriched_peptides)/50), ".rds"))
  }else{
    background = sapply(1:nperm, function(x){
      random_peptide = sample(rownames(rawcount), length(enriched_peptides))
      tmpRes = pwAlignRes[pwAlignRes$Peptide %in% random_peptide, ]
      tmpRes = tmpRes[tmpRes$Refseq %in% rownames(refseq), ]
      if(nrow(tmpRes)<1) return(rep(0, nrow(scores)))
      tmpRes = cbind(tmpRes, Random = 1)
      tmpRes = PositionScore(tmpRes, refseq, core)
    })
    saveRDS(background, paste0(outdir, "/Background_", ceiling(length(enriched_peptides)/50), ".rds"))
  }

  message(Sys.time(), " Compute Z-scores for each position ...")
  tmpSds = matrixStats::rowSds(background)
  tmpSds[tmpSds==0] = min(tmpSds[tmpSds>0])
  zscores = (scores - rowMeans(background)) / tmpSds
  zscores[zscores<0] = 0

  scores = as.data.frame(scores)
  scores$position = 1:nrow(scores)
  scores$refseq = rep(rownames(refseq), refseq$end-refseq$start+1)
  scores = scores[, c(ncol(scores), ncol(scores)-1, 1:(ncol(scores)-2))]
  zscores = as.data.frame(zscores)
  zscores$position = 1:nrow(zscores)
  zscores$refseq = rep(rownames(refseq), refseq$end-refseq$start+1)
  zscores = zscores[, colnames(scores)]
  saveRDS(scores, paste0(outdir, "/", proj, "_AlignScore.rds"))
  saveRDS(zscores, paste0(outdir, "/", proj, "_AlignZScore.rds"))

  message(Sys.time(), " Visualize the top enriched proteins ...")
  gg = scores
  gg = cbind(gg, Background = rowMeans(background))
  p = eProteinView(gg, top = 20, topnames = topnames)
  ggsave(paste0(outdir, "/", proj, "_summary.png"), p,
         width = 20, height = 5, dpi = 200)
  # if(nrow(lfc_bac)<4) next
  # message(Sys.time(), " Enriched ", k, "mer clustering ...")
  # clust = msaClusterView(peptides=rownames(lfc_bac)[1:min(100,nrow(lfc_bac))], proj=proj, outdir=outdir)
  # saveRDS(clust, paste0(outdir, "/", proj, "_hclust.rds"))
}
WubingZhang/PhageR documentation built on July 2, 2019, 9:03 p.m.