R/GeneratePWAlignDict.R

#' Generate pairwise alignment dictionary between reference genome and peptides
#'
#' @docType methods
#' @name GeneratePWAlignDict
#' @rdname GeneratePWAlignDict
#'
#' @param peptides A vector of sequences.
#' @param refseq_fa A fasta file in which sequences are applied to blast.
#' @param rdsfile The path to rds file.
#' @param core The number of core used to run pairwised sequence alignment.
#'
#' @return A data frame with blast results, (peptide, refseq, score, start, end, width).
#'
#' @author Wubing Zhang
#' @import parallel
#' @export
#'
GeneratePWAlignDict <- function(peptides, refseq_fa, rdsfile=NULL, core = 2){
  require(Biostrings)
  require(parallel)
  refseq = readAAStringSet(refseq_fa)
  refseq = as.data.frame(refseq)
  refseq$ID = rownames(refseq)
  rownames(refseq) = gsub(".*\\||_.*", "", rownames(refseq))
  colnames(refseq)[1] = "sequence"
  refseq$end = cumsum(nchar(refseq$sequence))
  refseq$start = refseq$end - nchar(refseq$sequence) + 1

  if(nrow(refseq)<length(peptides)){
    pwaRes <- matrix(unlist(mclapply(1:nrow(refseq), function(i){
      tmp = pairwiseAlignment(peptides, refseq$sequence[i], type = "local")
      tmp1 = data.frame(peptide=peptides, refseq=rownames(refseq)[i],
                        score=tmp@score, stringsAsFactors = FALSE)
      tmp1 = cbind.data.frame(tmp1, tmp@subject@range)
      tmp1 = tmp1[tmp1$width>2, ,drop = FALSE]
      as.vector(t(tmp1))
    }, mc.cores = core)), ncol = 6, byrow = TRUE)
  }else{
    pwaRes <- matrix(unlist(mclapply(peptides, function(p){
      tmp = pairwiseAlignment(refseq$sequence, p, type = "local")
      tmp1 = data.frame(peptide=p, refseq=rownames(refseq),
                        score=tmp@score, stringsAsFactors = FALSE)
      tmp1 = cbind.data.frame(tmp1, tmp@pattern@range)
      tmp1 = tmp1[tmp1$width>2, ,drop = FALSE]
      as.vector(t(tmp1))
    }, mc.cores = core)), ncol = 6, byrow = TRUE)
  }
  if(nrow(pwaRes)<1) return(NULL)
  pwaRes = as.data.frame(pwaRes, stringsAsFactors = FALSE)
  colnames(pwaRes) = c("Peptide", "Refseq", "AlignScore", "Start", "End", "Width")
  pwaRes$AlignScore = as.numeric(pwaRes$AlignScore)
  pwaRes$Start = as.integer(pwaRes$Start)
  pwaRes$End = as.integer(pwaRes$End)
  pwaRes$Width = as.integer(pwaRes$Width)
  if(!is.null(rdsfile)) saveRDS(pwaRes, file = rdsfile)
  return(pwaRes)
}
WubingZhang/PhageR documentation built on July 2, 2019, 9:03 p.m.