R/sequenceFromMutations.R

#' produceSequenceFromHGVSCode
#'
#' Given an HGVS mutation code, produce corresponding cDNA and amino-acid sequence,
#' starting from the ATG initiator codon and up to STOP codon. If mutation clobbers the STOP codon,
#' sequence will be produced until cDNA string runs out.
#' @param hgvs Character string with  HGVS mutation code
#' @param transcriptID Character string with ENST ID to which hgvs code corresponds
#' @param wildTypeCDNA Character string with WT reference cDNA sequence
#' @param lowerCaseChange Highlight change in lowercase
#' @param debug Output debug info
#' @return list with elements cdna and aa representing new cDNA and amino acid mutated sequence
#' @import Biostrings
#' @keywords cats
#' @export
#' @examples
#' str <- produceSequenceFromHGVSCode('c.571G>C','ENST00000374840',wtCDNA)

produceSequenceFromHGVSCode <- function(hgvs, transcriptID, wildTypeCDNA, lowerCaseChange=F,debug=F) {
  # call VEP to get: start/stop positions and substitutions. Trivial for SNV's but makes life easier
  # for indels or complex substitutions
  vepData <- getVEPFromHGVS(hgvs,transcriptID,verbose=T,includeOnlyCanonicalTranscript=T)
  offset = vepData$transcript_consequences$cds_start
  alstr <- vepData$allele_string
  a<-strsplit(x = alstr,split="/")
  ref <- a[[1]][1]
  alt <- a[[1]][2]
  if (alt=="-") alt=""
  if (ref=="-") {
    ref=""
    offset <- offset+1
  }
  #eventLength <- vepData$end-vepData$start # produces 0 for snps, delLength-1 for deletions,insLength for insertions,
  eventLength <- nchar(alt)-nchar(ref)

  if (lowerCaseChange) alt<-tolower(alt)
  # sanity check: ref must match cDNA string content
  if (debug) {
    print(offset)
    print(ref)
    print(alt)
    print(eventLength)
  }

  newSequence <-paste0(substr(wildTypeCDNA, start=1, stop=offset-1),
                       alt,
                       substr(wildTypeCDNA, start=offset+nchar(ref), stop=nchar(wildTypeCDNA)))

  aaSequence <- translate(DNAString(newSequence))
  data.frame(hgvs=hgvs,cdna=as.character(newSequence),aa=as.character(aaSequence))
}


#' createCDNASequences
#'
#' Given an HGVS mutation code, produce corresponding cDNA and amino-acid sequence,
#' starting from the ATG initiator codon and up to STOP codon.
#' @param mutationList Character string vector with  HGVS mutation codes to produce
#' @param transcriptID Character string with ENST ID to which hgvs code corresponds
#' @param lowerCaseChange Boolean - Highlight change in lowercase
#' @return Data frame with fields cdna and aa representing new cDNA and amino acid mutated sequence
#' @keywords cats
#' @export
#' @examples
#' str <- createCDNASequences(c('c.571G>C','c.1559delT'),'ENST00000374840')
createCDNASequences <- function(mutationList, transcriptID,lowerCaseChange=F) {
  wtCDNA <- getCDNASequence(transcriptID)
  res <- lapply(mutationList,produceSequenceFromHGVSCode,transcriptID,wtCDNA,lowerCaseChange)
  do.call("rbind",res)
}
gdelangel/genomicRTools documentation built on May 16, 2019, 11:14 p.m.